Commit 7f378cf4 authored by Paulo Medeiros
Browse files

Fix a few crashes in outlier removal methods

Fixes issue #5
parent 2193ff60
Pipeline #9774 failed with stage
in 0 seconds
......@@ -134,7 +134,7 @@ def run_clustering_on_df(
df,
config,
distance_matrix=None,
outlier_rm_method="config",
remove_outliers=True,
calc_silhouette_samples=True,
n_jobs=-1,
):
......@@ -145,9 +145,8 @@ def run_clustering_on_df(
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
Default value = None.
outlier_rm_method (str): Outlier removal method.
Default value = "config". This means that the default will be
retrieved from the corresponding value in the config argument.
remove_outliers (bool): Use a post-clustering outlier removal method?
Default value = True.
calc_silhouette_samples (bool): Calculate or not silhouette_samples?
Default value = True.
n_jobs (int): Max number of local-host parallel jobs.
......@@ -166,6 +165,7 @@ def run_clustering_on_df(
raise NotImplementedError('Method "{}" not available.'.format(method))
if len(df.index) == 0:
logger.warning("Dataframe has no rows")
df["cluster_label"] = None
return df
# We will not do any df = StandardScaler().fit_transform(df),
......@@ -239,17 +239,14 @@ def run_clustering_on_df(
# expects 'cluster_label' to be the last column in the dataframe
df["cluster_label"] = db.labels_
# Refine clustering if requested
# It is important to have 'cluster_label' as the last column
# when running the iterative refine routine
if outlier_rm_method == "config":
outlier_rm_method = config.get_clustering_opt("outlier_removal.method")
if outlier_rm_method:
if remove_outliers:
# Refine clustering if requested
# It is important to have 'cluster_label' as the last column
# when running the iterative refine routine
df = filter_outliers(
df,
config=config,
db=db,
outlier_rm_method=outlier_rm_method,
distance_matrix=distance_matrix,
n_jobs=n_jobs,
reclustering_function=self_consistent_reclustering,
......@@ -322,7 +319,7 @@ def self_consistent_reclustering(df, config, distance_matrix, **kwargs):
df_rec = run_clustering_on_df(
df,
config=config,
outlier_rm_method=None,
remove_outliers=False,
distance_matrix=distance_matrix.subspace(i_valid_obs),
calc_silhouette_samples="silhouette_score" in df.columns,
**kwargs,
......@@ -352,11 +349,7 @@ def _cluster_netatmo_obs_one_domain(df, config, **kwargs):
"""
time_start_clustering = time.time()
logger.debug("Performing clustering...")
df = run_clustering_on_df(
df=df,
config=config,
**kwargs,
)
df = run_clustering_on_df(df=df, config=config, **kwargs)
time_end_clustering = time.time()
logger.debug(
"Done with clustering. Elapsed: %.2fs",
......
......@@ -210,6 +210,9 @@ def filter_outliers_iterative(df, config, trunc_perc=0.25):
column's mean. Repeat the process until either no data is removed or
the max number of iterations is reached.
It is important to have 'cluster_label' as the last column when running
this method.
Args:
df (pandas.Dataframe): Dataframe containing clustering info.
config (ParsedConfig): Parsed configs.
......@@ -295,6 +298,8 @@ def get_local_outlier_factors(df, distance_matrix, calc_per_cluster=False):
all_lof_values[indices] = clf.negative_outlier_factor_
else:
indices = df.index[df["cluster_label"] > -1]
if len(indices) == 0:
return all_lof_values
clf = LocalOutlierFactor(n_neighbors=3, metric="precomputed")
clf.fit_predict(distance_matrix.subspace(indices))
all_lof_values[indices] = clf.negative_outlier_factor_
......@@ -322,16 +327,16 @@ def filter_outliers_lof(df, distance_matrix):
# Higher-level outlier removal routine calling the specific ones defined above
def filter_outliers(df, db, outlier_rm_method, distance_matrix, **kwargs):
def filter_outliers(df, db, config, distance_matrix, **kwargs):
"""Filter outliers according to specified outlier detection method.
Args:
df (pandas.Dataframe): Input data with clustering info.
db (obj): Output of clustering method.
outlier_rm_method (str): Outlier detection method of choice.
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (netatmoqc.metrics.HollowSymmetricMatrix): Distance
matrix consistent with the input data.
**kwargs: Passed on to internalm wrapped routines.
**kwargs: Passed on to internal wrapped routines.
Returns:
pandas.Dataframe: Copy of input data, with clustering labels for
......@@ -339,6 +344,8 @@ def filter_outliers(df, db, outlier_rm_method, distance_matrix, **kwargs):
"""
tstart = time.time()
outlier_rm_method = config.get_clustering_opt("outlier_removal.method")
logger.debug(
' > Running outlier removal method "%s" with kwargs=%s',
outlier_rm_method,
......@@ -353,9 +360,11 @@ def filter_outliers(df, db, outlier_rm_method, distance_matrix, **kwargs):
rtn = filter_outliers_lof(df, distance_matrix=distance_matrix)
elif outlier_rm_method == "reclustering":
func = kwargs.pop("reclustering_function")
rtn = func(df, distance_matrix, **kwargs)
rtn = func(
df, config=config, distance_matrix=distance_matrix, **kwargs
)
else:
rtn = filter_outliers_iterative(df, **kwargs)
rtn = filter_outliers_iterative(df, config=config)
logger.debug(
" * Done with outlier removal. Elapsed: %.1fs",
time.time() - tstart,
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment