Commit 2193ff60 authored by Paulo Medeiros
Browse files

Some more refactoring to pass config as argument

parent b4cab6fe
......@@ -250,20 +250,7 @@ def run_clustering_on_df(
config=config,
db=db,
outlier_rm_method=outlier_rm_method,
# Args that apply only to LOF
distance_matrix=distance_matrix,
# Args that only apply for the iterative method
max_num_refine_iter=config.get_clustering_opt(
"outlier_removal.{}.max_n_iter".format(outlier_rm_method)
),
max_n_stdev_around_mean=config.get_clustering_opt(
"outlier_removal.{}.max_n_stdev".format(outlier_rm_method)
),
weights_dict=config.get_clustering_opt("obs_weights"),
method=method,
eps=config.get_clustering_opt("eps"),
min_cluster_size=config.get_clustering_opt("min_cluster_size"),
min_samples=config.get_clustering_opt("min_samples"),
n_jobs=n_jobs,
reclustering_function=self_consistent_reclustering,
)
......
......@@ -348,7 +348,7 @@ def calc_distance_matrix(df, config, num_threads=-1):
return calc_distance_matrix_haversine(
# Drop columns that won't be used in the clustering
df=df,
weights=weights_dict_to_np_array(df, weights_dict),
weights=weights_dict_to_np_array(df, config=config),
optimize_mode=config.general.custom_metrics_optimize_mode,
num_threads=num_threads,
)
......
......@@ -201,14 +201,7 @@ def _filter_outliers_iterative(
return df[:, -1]
def filter_outliers_iterative(
df,
config,
weights_dict,
trunc_perc=0.25,
max_num_refine_iter=1000,
max_n_stdev_around_mean=2.0,
):
def filter_outliers_iterative(df, config, trunc_perc=0.25):
"""Filter outliers using an iterative method.
For each obs type (column) in the dataframe, calculate the mean value
......@@ -220,15 +213,9 @@ def filter_outliers_iterative(
Args:
df (pandas.Dataframe): Dataframe containing clustering info.
config (ParsedConfig): Parsed configs.
weights_dict (dict): Weights chosen for each observation parameter.
trunc_perc (float): Proportion of array elements to be removed for the
calculation of the truncated stds and means. Should lie between
0 and 1. (Default value = 0.25).
max_num_refine_iter (int): Max number of iterations performed.
(Default value = 1000).
max_n_stdev_around_mean (float): Max allowed number of stdevs around
the mean for an observation to be considered acceptable.
(Default value = 2.0).
Returns:
pandas.Dataframe: Copy of input data, with clustering labels reassigned
......@@ -237,10 +224,14 @@ def filter_outliers_iterative(
"""
df["cluster_label"] = _filter_outliers_iterative(
df.drop(config.general.unclusterable_data_columns, axis=1),
max_num_iter=max_num_refine_iter,
max_n_stdev_around_mean=max_n_stdev_around_mean,
max_num_iter=config.get_clustering_opt(
"outlier_removal.iterative.max_n_iter"
),
max_n_stdev_around_mean=config.get_clustering_opt(
"outlier_removal.iterative.max_n_stdev"
),
trunc_perc=trunc_perc,
weights=weights_dict_to_np_array(df, weights_dict),
weights=weights_dict_to_np_array(df, config=config),
).astype(int)
return df
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment