Commit 4e49bd2e authored by Paulo Medeiros's avatar Paulo Medeiros
Browse files

Bugfixes. Rename "--selected-stations-fpath" opt.

Changed:
    * Rename the "--selected_stations" option in the csv2obsoul
         command to "--selected-stations-fpath"
    * Make invalid selected_stations_fpath raise error

Fixed:
    * Crash in --selected-stations-fpath opt
    * Crash in csv2obsoul command
    * Crash in clustering & select when no station gets accepted
        in the clustering.

This is a merge commit.
parents 16623341 37c1d389
Pipeline #9239 passed with stages
in 1 minute and 17 seconds
......@@ -184,7 +184,7 @@ def get_parsed_args(program_name):
metavar="data_column=fillna_value_for_col",
)
parser_csv2obsoul.add_argument(
"--selected_stations",
"--selected-stations-fpath",
metavar="STATIONS_FILE_PATH",
default=None,
type=Path,
......
......@@ -585,13 +585,16 @@ def report_clustering_results(df):
df (pandas.DataFrame): Dataframe with clustering info.
"""
if len(df.index) == 0:
logger.info("Dataframe has no rows. No results to report.")
return
n_obs = len(df.index)
noise_data_df = df[df["cluster_label"] < 0]
n_noise_clusters = len(noise_data_df["cluster_label"].unique())
noise_count = len(noise_data_df)
n_clusters = len(df["cluster_label"].unique()) - n_noise_clusters
n_accepted = n_obs - noise_count
silhouette_score = df["silhouette_score"].mean(skipna=True)
logger.info("Number of obs passed to the clustering routine: %d", n_obs)
logger.info("Estimated number of clusters: %d", n_clusters)
......@@ -601,4 +604,8 @@ def report_clustering_results(df):
noise_count,
100.0 * noise_count / n_obs,
)
logger.info("Mean silhouette score: %.3f", silhouette_score)
try:
silhouette_score = df["silhouette_score"].mean(skipna=True)
logger.info("Mean silhouette score: %.3f", silhouette_score)
except KeyError:
pass
......@@ -287,6 +287,16 @@ def select_stations(args):
len(df_unique_moving),
)
if len(df_accepted.index) == 0:
logger.info(
"%sSelection finished. No stations were accepted. "
"%d stations were rejected.%s",
logcolor.cyan,
len(df_rejected.index),
logcolor.reset,
)
return
##########################################################################
# Trim accepted obs to (a possibly coarser version of) the domain's grid #
##########################################################################
......@@ -308,13 +318,14 @@ def select_stations(args):
n_preliminary_accepted = len(df_accepted.index)
# (a) Add rejection rate info
@np.vectorize
def rej_rate(stat_id):
def rej_rate_non_vec(stat_id):
    """Return the rejection rate recorded for station *stat_id*.

    Stations absent from ``id2rejection_rate`` (closure variable from the
    enclosing scope) are treated as never rejected, i.e. rate 0.0.
    """
    try:
        rate = id2rejection_rate[stat_id]
    except KeyError:
        # Station was never rejected, so no entry exists for it.
        rate = 0.0
    return rate
rej_rate = np.vectorize(rej_rate_non_vec, otypes=[float])
df_accepted["rejection_rate"] = rej_rate(df_accepted["id"])
# (b) Sort by rejection rate (lower to higher)
......@@ -445,20 +456,21 @@ def csv2obsoul(args):
# Allow mkdir to raise eventual exceptions if cannot write to outdir
outdir.mkdir(parents=True)
if args.selected_stations is not None:
if str(args.selected_stations).endswith(".csv"):
selected_stations = pd.read_csv(args.selected_stations)["id"]
# Allow picking only selected stations
selected_stations = None
if args.selected_stations_fpath is not None:
if args.selected_stations_fpath.suffix == ".csv":
selected_stations = pd.read_csv(args.selected_stations_fpath)["id"]
logger.info(
"Read %s stations from file '%s'",
"Found %s selected stations in file '%s'.",
len(selected_stations),
args.selected_stations,
args.selected_stations_fpath,
)
else:
logger.warning(
"Only csv files supported. Skipping file '%s'",
args.selected_stations,
raise NotImplementedError(
'Only csv files supported in "--selected-stations-fpath". '
"Received '%s'." % (args.selected_stations_fpath),
)
selected_stations = None
netatmoqc_input2output(
config.general.dtgs,
......
......@@ -554,6 +554,8 @@ class Domain:
pandas.Dataframe: Trimmed dataframe.
"""
if len(df.index) == 0:
return df
trimmed_df = df[self.contains_lonlat(df.lon, df.lat)].copy()
trimmed_df = trimmed_df.reset_index(drop=True)
return trimmed_df
......
......@@ -75,6 +75,11 @@ class DataFrameWithDtgMetadata(pd.DataFrame):
common_keys = set()
for dic in md_dicts_to_combine:
if dic == {}:
# If any dict is empty, then there will be no
# common keys at all.
common_keys = set()
break
common_keys = common_keys.union(set(dic))
combined_md_dict = {}
......@@ -496,8 +501,11 @@ def remove_irregular_stations(
Default value = True)
Returns:
'pandas.Dataframe': Copy of input df, but with the
irregular stations removed.
{
'pandas.Dataframe': Copy of input df, but with the
irregular stations removed,
'pandas.Dataframe': Dataframe containing removed moving stations.
}
"""
dtg = df.metadata_dict["dtg"]
......@@ -512,6 +520,7 @@ def remove_irregular_stations(
dtg,
)
else:
df_moving_stations = pd.DataFrame.empty
logger.warning("Not checking for moving stations")
if duplicates:
......@@ -526,4 +535,4 @@ def remove_irregular_stations(
df.metadata_dict["dtg"] = dtg
return (df, df_moving_stations) if moving else df
return df, df_moving_stations
......@@ -7,7 +7,7 @@
[tool.poetry]
name = "netatmoqc"
version = "0.3.4.dev3"
version = "0.3.4.dev4"
description = "Use machine learning clustering methods to perform quality control over NetAtmo data"
authors = [
"Paulo V. C. Medeiros <paulo.medeiros@smhi.se>"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment