Commit 3a6c7250 authored by Paulo Medeiros
Browse files

Add unclusterable_data_columns config option

Replaces the hardcoded "skip" lists. This is clearer and more useful.
parent 336e43ea
......@@ -134,7 +134,6 @@ def run_clustering_on_df(
df,
config,
distance_matrix=None,
skip=("id", "time_utc"),
outlier_rm_method="config",
calc_silhouette_samples=True,
n_jobs=-1,
......@@ -146,8 +145,6 @@ def run_clustering_on_df(
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
Default value = None.
skip (tuple): Columns to skip in df.
Default value = ("id", "time_utc").
outlier_rm_method (str): Outlier removal method.
Default value = "config". This means that the default will be
retrieved from the corresponding value in the config argument.
......@@ -180,7 +177,7 @@ def run_clustering_on_df(
# jit-compilation via numba, then the relative speed up can reach
# up to 120x.
distance_matrix = calc_distance_matrix(
df=df, config=config, skip=skip, num_threads=n_jobs
df=df, config=config, num_threads=n_jobs
)
# Running clustering with the computed distance matrix
......@@ -250,12 +247,12 @@ def run_clustering_on_df(
if outlier_rm_method:
df = filter_outliers(
df,
config=config,
db=db,
outlier_rm_method=outlier_rm_method,
# Args that apply only to LOF
distance_matrix=distance_matrix,
# Args that only apply for the iterative method
skip=skip,
max_num_refine_iter=config.get_clustering_opt(
"outlier_removal.{}.max_n_iter".format(outlier_rm_method)
),
......
......@@ -390,6 +390,10 @@ with config_section("general") as section:
default="haversine",
choices=["correlation", "haversine"],
)
# Data cols to ignore when running clustering
config_metadata.register(
"unclusterable_data_columns", default=["id", "time_utc"]
)
# Data cols to export when saving obsoul output
config_metadata.register(
"obsoul_export_params",
......
......@@ -33,9 +33,7 @@ def haversine_distance(point1, point2):
return rtn
def weights_dict_to_np_array(
df, pairwise_diff_weights=None, skip=("id", "time_utc"), default=1
):
def weights_dict_to_np_array(df, config, default=1):
"""Convert pairwise_diff_weights into a numpy array.
Takes a pandas dataframe and a {column_name:weight} dictionary and returns
......@@ -48,11 +46,8 @@ def weights_dict_to_np_array(
Args:
df (pandas.Dataframe): Dataframe with observations.
pairwise_diff_weights (dict): {df_column_name:weight} dictionary.
Default value = None.
skip: df columns that will not enter the clustering and should
therefore be skipped. Default value = ("id", "time_utc")
default: Default weight to be assigned for a non-skipped df column if
config (ParsedConfig): Parsed configs.
default: Default weight to be assigned to a non-skipped df column if
the column name is present in df but not in pairwise_diff_weights.
Default value = 1.
......@@ -69,10 +64,12 @@ def weights_dict_to_np_array(
raise ValueError("'lat' column is not followed by 'lon' column")
weights = []
weights_dict = config.get_clustering_opt("obs_weights")
unclusterable_cols = config.general.unclusterable_data_columns
col2weight = {c: ("geo_dist" if c == "lon" else c) for c in df.columns}
for col in df.columns[~df.columns.isin(list(skip) + ["lat"])]:
for col in df.columns[~df.columns.isin(unclusterable_cols + ["lat"])]:
try:
weights.append(pairwise_diff_weights[col2weight[col]])
weights.append(weights_dict[col2weight[col]])
except (KeyError, TypeError):
weights.append(default)
return np.array(weights, dtype=np.float64)
......@@ -295,9 +292,7 @@ def calc_distance_matrix_considering_correlation(
df.insert(0, "y", yvals / 1000.0)
df.insert(0, "x", xvals / 1000.0)
cols_to_skip = ["id", "time_utc"]
cols_to_skip += ["lon", "lat"]
selected_data_columns = [c for c in df.columns if c not in cols_to_skip]
selected_data_columns = [c for c in df.columns if c not in ["lon", "lat"]]
df = df[selected_data_columns]
weights_array = np.array([weights_dict.get(c, 1.0) for c in df.columns])
......@@ -316,12 +311,7 @@ def calc_distance_matrix_considering_correlation(
return rtn
def calc_distance_matrix(
df,
config,
skip=None,
num_threads=-1,
):
def calc_distance_matrix(df, config, num_threads=-1):
"""Calculate distance matrix between obs in dataframe df.
Spatial distances are calculated by projecting (lon, lat) into (x, y)
......@@ -332,11 +322,8 @@ def calc_distance_matrix(
Args:
df (pandas.Dataframe): Input data.
config (ParsedConfig): Parsed configs.
skip (list): List of datafreme columns to skip/ignore.
num_threads: Max number of threads used for the computation.
(Default value = -1)
method: The method used in the calculation of the distances.
(Default value = "haversine")
Raises:
NotImplementedError: If the passed method is not supported.
......@@ -356,10 +343,11 @@ def calc_distance_matrix(
logger.debug("Computing distance matrix using the '%s' method", method)
weights_dict = config.get_clustering_opt("obs_weights")
df = df.copy().drop(config.general.unclusterable_data_columns, axis=1)
if method == "haversine":
return calc_distance_matrix_haversine(
# Drop columns that won't be used in the clustering
df=df.drop(list(skip), axis=1),
df=df,
weights=weights_dict_to_np_array(df, weights_dict),
optimize_mode=config.general.custom_metrics_optimize_mode,
num_threads=num_threads,
......
......@@ -203,7 +203,7 @@ def _filter_outliers_iterative(
def filter_outliers_iterative(
df,
skip,
config,
weights_dict,
trunc_perc=0.25,
max_num_refine_iter=1000,
......@@ -219,8 +219,7 @@ def filter_outliers_iterative(
Args:
df (pandas.Dataframe): Dataframe containing clustering info.
skip (list): List of names of columns in the df to be skipped (i.e.,
not taken into account in the filtering).
config (ParsedConfig): Parsed configs.
weights_dict (dict): Weights chosen for each observation parameter.
trunc_perc (float): Proportion of array elements to be removed for the
calculation of the truncated stds and means. Should lie between
......@@ -237,7 +236,7 @@ def filter_outliers_iterative(
"""
df["cluster_label"] = _filter_outliers_iterative(
df.drop(list(skip), axis=1),
df.drop(config.general.unclusterable_data_columns, axis=1),
max_num_iter=max_num_refine_iter,
max_n_stdev_around_mean=max_n_stdev_around_mean,
trunc_perc=trunc_perc,
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment