Commit c3a3730a authored by Paulo Medeiros

New metrics calc methods. Other minor changes.

Summary of main changes below:

- New metrics calc methods (an illustrative sketch of one such metric follows this summary):
    - correlation_aware_euclidean (the new default)
    - haversine_plus_euclidean
    - haversine_plus_manhattan (the only one implemented previously)
- Add "unclusterable_data_columns" general config option
- Allow choice of HDBSCAN's cluster_selection_method
- Metrics now have their own section in the config file
- Support weights for dists along proj x and y
- Make used and visualised proj params more compatible
- Remove unused "tstep" from domain configs
- Fix a few crashes in outlier removal methods (addresses issue #5)
- Fix a few warnings
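
For orientation, the sketch below illustrates the general idea behind a "haversine plus Euclidean" metric: a weighted great-circle (haversine) term for the lat/lon pair plus a weighted Euclidean term over the remaining observation columns. All names, weights, and the way the two terms are combined are illustrative assumptions; the package's own metrics module defines the actual methods listed above.

```python
import numpy as np

EARTH_RADIUS_KM = 6371.0


def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two (lat, lon) points, in km."""
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (
        np.sin((lat2 - lat1) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2
    )
    return 2.0 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))


def haversine_plus_euclidean_sketch(obs_a, obs_b, geo_weight=1.0, weights=None):
    """Toy 'haversine + Euclidean' distance between two observation dicts."""
    geo_term = geo_weight * haversine_km(
        obs_a["lat"], obs_a["lon"], obs_b["lat"], obs_b["lon"]
    )
    features = [key for key in obs_a if key not in ("lat", "lon")]
    if weights is None:
        weights = {key: 1.0 for key in features}
    euclidean_term = np.sqrt(
        sum((weights[key] * (obs_a[key] - obs_b[key])) ** 2 for key in features)
    )
    # How the two terms are combined here is an assumption; we simply add them.
    return geo_term + euclidean_term


obs_1 = {"lat": 59.91, "lon": 10.75, "alt": 10.0, "temperature": 274.0}
obs_2 = {"lat": 59.95, "lon": 10.80, "alt": 25.0, "temperature": 273.2}
print(haversine_plus_euclidean_sketch(obs_1, obs_2, weights={"alt": 0.1, "temperature": 5.0}))
```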
parents c615173b 35621ffd
@@ -46,14 +46,20 @@
 # Note that the interval is closed at the left and open at the right.
 dtgs.cycle_length = '3H' # Default = '3H'
 #
-# custom_metrics_optimize_mode:
+[metrics]
+# method:
+# > default: "correlation_aware_euclidean"
+# > choices: "correlation_aware_euclidean", "haversine_plus_manhattan"
+method = "correlation_aware_euclidean"
+# optimize_mode:
 # > default: "memory"
 # > choices: "memory", "speed_mem_compromise", "speed"
 # In terms of memory usage:
 # "memory" < "speed_mem_compromise" < "speed"
 # In terms of execution time:
 # "memory" > "speed_mem_compromise" > "speed"
-custom_metrics_optimize_mode = "memory"
+optimize_mode = "memory"
 ##################################
 # Options controlling the domain #
@@ -64,7 +70,6 @@
 # <https://hirlam.org/trac/wiki/HarmonieSystemDocumentation/ModelDomain>
 # <https://hirlam.org/trac/browser/Harmonie/scr/Harmonie_domains.pm>
 name = ""
-tstep = 75
 nlon = 900
 nlat = 960
 lonc = 16.763011639
...
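For reference, a minimal sketch of how a user-side script could read the new [metrics] section from such a TOML config file. This uses Python's standard tomllib and hard-coded defaults purely for illustration; the package's own config parser (touched further down in this commit) is what actually handles it.

```python
import tomllib  # Python >= 3.11; older interpreters can use the third-party "toml" package

with open("config.toml", "rb") as config_file:
    raw_config = tomllib.load(config_file)

# Fall back to the documented defaults when the section or keys are absent.
metrics_section = raw_config.get("metrics", {})
method = metrics_section.get("method", "correlation_aware_euclidean")
optimize_mode = metrics_section.get("optimize_mode", "memory")
print("metrics.method =", method, "| metrics.optimize_mode =", optimize_mode)
```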
 #!/usr/bin/env python3
 """Common definitions."""
+import numba
 try:
     # From python3.8
     from importlib.metadata import version
@@ -11,3 +13,8 @@ try:
     __version__ = version(__name__)
 except ModuleNotFoundError:
     __version__ = "?"
+# Set the threading layer before any parallel target compilation.
+# Picking "omp" specifically to avoid warnings about old TBB versions.
+# See <http://numba.pydata.org/numba-doc/latest/user/threading-layer.html>
+numba.config.THREADING_LAYER = "omp"
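As the added comment says, the threading layer has to be chosen before the first parallel-target function is compiled. A small, self-contained illustration of that ordering constraint, assuming an OpenMP-capable numba install; the function and data below are made up for the example:

```python
import numba
import numpy as np

# Must be set before any @njit(parallel=True) function is compiled.
numba.config.THREADING_LAYER = "omp"

from numba import njit, prange


@njit(parallel=True)
def row_sums(matrix):
    # Rows are processed in parallel using the threading layer chosen above.
    out = np.empty(matrix.shape[0])
    for i in prange(matrix.shape[0]):
        out[i] = matrix[i, :].sum()
    return out


print(row_sums(np.ones((4, 3))))  # first call triggers compilation; prints [3. 3. 3. 3.]
```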
@@ -263,6 +263,34 @@ def generate_control_card():
             ),
             html.Br(),
             #
+            html.Div(
+                id="metrics_method_div",
+                children=[
+                    html.Br(),
+                    html.P("Metrics Calculation Method"),
+                    dcc.Dropdown(
+                        id="metrics_method",
+                        options=[
+                            {
+                                "label": "Correlation-Aware Euclidean",
+                                "value": "correlation_aware_euclidean",
+                            },
+                            {
+                                "label": "Haversine + Manhattan",
+                                "value": "haversine_plus_manhattan",
+                            },
+                            {
+                                "label": "Haversine + Euclidean",
+                                "value": "haversine_plus_euclidean",
+                            },
+                        ],
+                        value="correlation_aware_euclidean",
+                    ),
+                ],
+                style={"display": "block", "text-align": "center"},
+            ),
+            html.Br(),
+            #
             html.Div(
                 id="optionals_div",
                 children=[
@@ -604,6 +632,7 @@ def show_hide_max_num_refining_iter(outlier_rm_method):
         State("eps", "value"),
         State("date-picker-select", "date"),
         State("cycle-select", "value"),
+        State("metrics_method", "value"),
         State("outlier_rm_method", "value"),
         State("max_num_refine_iter", "value"),
         State("max_n_std_around_mean", "value"),
@@ -624,6 +653,7 @@ def run_clustering_and_make_plot(
     eps,
     date,
     cycle,
+    metrics_method,
     outlier_rm_method,
     max_num_refining_iter,
     refine_max_std,
@@ -660,12 +690,15 @@
         {
             "general": dict(
                 clustering_method=method,
-                custom_metrics_optimize_mode=config.general.custom_metrics_optimize_mode,
                 dtgs=dict(
                     list=config.general.dtgs,
                     cycle_length=config.general.dtgs.cycle_length.freqstr,
                 ),
             ),
+            "metrics": dict(
+                method=metrics_method,
+                optimize_mode=config.metrics.optimize_mode,
+            ),
             "clustering_method.%s"
             % (method): dict(
                 eps=eps,
...
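The wiring above follows the standard Dash pattern: the dropdown's current value is exposed to the clustering callback as a State, so it is read only when the callback actually fires. A stripped-down, hypothetical illustration of that pattern for Dash >= 2 (the component ids and callback body are made up; only the idea matches the app):

```python
from dash import Dash, Input, Output, State, dcc, html

app = Dash(__name__)
app.layout = html.Div(
    [
        dcc.Dropdown(
            id="metrics_method",
            options=[
                {"label": "Correlation-Aware Euclidean", "value": "correlation_aware_euclidean"},
                {"label": "Haversine + Euclidean", "value": "haversine_plus_euclidean"},
                {"label": "Haversine + Manhattan", "value": "haversine_plus_manhattan"},
            ],
            value="correlation_aware_euclidean",
        ),
        html.Button("Run clustering", id="run_button"),
        html.Div(id="result"),
    ]
)


@app.callback(
    Output("result", "children"),
    Input("run_button", "n_clicks"),
    State("metrics_method", "value"),
)
def on_run(n_clicks, metrics_method):
    # The State value is read only when the button triggers the callback.
    return "Selected metrics method: {}".format(metrics_method)


if __name__ == "__main__":
    app.run_server(debug=True)
```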
@@ -89,51 +89,6 @@ def sort_df_by_cluster_size(df):
     return df.drop("parent_cluster_size", axis=1).reset_index(drop=True)
-def weights_dict_to_np_array(
-    df, pairwise_diff_weights=None, skip=("id", "time_utc"), default=1
-):
-    """Convert pairwise_diff_weights into a numpy array.
-    Takes a pandas dataframe and a {column_name:weight} dictionary and returns
-    an array of weights to be passed to the routine that calculates the
-    distance matrix.
-    Columns "lat" and "lon" in df are treated specially, in that they are
-    not assigned a weight individually, but rather a single weight gets
-    assigned to the "geo_dist" property.
-    Args:
-        df (pandas.Dataframe): Dataframe with observations.
-        pairwise_diff_weights (dict): {df_column_name:weight} dictionary.
-            Default value = None.
-        skip: df columns that will not enter the clustering and should
-            therefore be skipped. Default value = ("id", "time_utc")
-        default: Default weight to be assigned for a non-skipped df column if
-            the column name is present in df but not in pairwise_diff_weights.
-            Default value = 1.
-    Returns:
-        numpy.ndarray: weights to be passed to the routine that calculates the
-            distance matrix.
-    Raises:
-        ValueError: If the dataframe 'lat' column is not followed by the 'lon'
-            column.
-    """
-    if df.columns.get_loc("lon") - df.columns.get_loc("lat") != 1:
-        raise ValueError("'lat' column is not followed by 'lon' column")
-    weights = []
-    col2weight = {c: ("geo_dist" if c == "lon" else c) for c in df.columns}
-    for col in df.columns[~df.columns.isin(list(skip) + ["lat"])]:
-        try:
-            weights.append(pairwise_diff_weights[col2weight[col]])
-        except (KeyError, TypeError):
-            weights.append(default)
-    return np.array(weights, dtype=np.float64)
 def get_silhouette_samples(df, distance_matrix):
     """Calculate silhouette scores for every obs in df.
@@ -177,50 +132,27 @@ def get_silhouette_samples(df, distance_matrix):
 def run_clustering_on_df(
     df,
-    method="hdbscan",
+    config,
+    domain,
     distance_matrix=None,
-    distance_matrix_optimize_mode="memory",
-    skip=("id", "time_utc"),
-    weights_dict=None,
-    eps=15, # eps applies only to dbscan
-    min_cluster_size=3, # min_cluster_size applies only to hdbscan
-    min_samples=3,
-    n_jobs=-1,
-    outlier_rm_method=None,
-    max_num_refine_iter=50,
-    max_n_stdev_around_mean=2.0,
-    trunc_perc=0.25,
+    remove_outliers=True,
     calc_silhouette_samples=True,
+    n_jobs=-1,
 ):
     """Low-level clustering routine.
     Args:
         df (pandas.Dataframe): Dataframe with observations.
-        method (str): {"hdbscan", "dbscan", "rsl", "optics"}
-            Clustering method.
+        config (netatmoqc.config.ParsedConfig): Program's general configs.
+        domain (netatmoqc.domains.Domain): The adopted spatial domain.
         distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
             Default value = None.
-        distance_matrix_optimize_mode (str): The distance matrix optimization
-            mode. Default value = "memory".
-        skip (tuple): Columns to skip in df.
-            Default value = ("id", "time_utc").
-        weights_dict (dict): obs_name --> obs_weight map. Default value = None.
-        eps (float): DBSCAN's eps parameter. Default value = 15.0.
-        min_cluster_size (int): HDBSCAN's min_cluster_size. Default value = 3.
-        min_samples (int): (H)DBSCAN's min_samples. Default value = 3.
-        n_jobs (int): Max number of local-host parallel jobs.
-            Default value = -1.
-        outlier_rm_method (str): Outlier removal method. Default value = None.
-        max_num_refine_iter (int): Max number of iterations for the
-            "iterative" ourlier removal method. Default value = 50.
-        max_n_stdev_around_mean (float): Max number of stdev from cluster obs
-            mean for an observation to be considered OK in the "iteractive"
-            outlier removal method. Default value = 2.0.
-        trunc_perc (float): Percentage used in array truncation when
-            calculating stdevs and means in the "iterative" outlier removal
-            method. Default value = 0.25.
+        remove_outliers (bool): Use a post-clustering outlier removal method?
+            Default value = True.
         calc_silhouette_samples (bool): Calculate or not silhouette_samples?
             Default value = True.
+        n_jobs (int): Max number of local-host parallel jobs.
+            Default value = -1.
     Returns:
         pandas.DataFrame: Copy of df with clustering added info.
@@ -229,35 +161,25 @@ def run_clustering_on_df(
         NotImplementedError: If the `method` choice is not valid.
     """
-    method = method.lower()
+    method = config.general.clustering_method.lower()
     # Compute clustering using DBSCAN or HDBSCAN
     if method not in ["dbscan", "hdbscan", "rsl", "optics"]:
         raise NotImplementedError('Method "{}" not available.'.format(method))
     if len(df.index) == 0:
         logger.warning("Dataframe has no rows")
+        df["cluster_label"] = None
         return df
-    # Set weights to be used in the metrics for the various
-    # generalised distances. The distances used in the metrics
-    # will be used_dist(i) = weight(i)*real_dist(i)
-    weights = weights_dict_to_np_array(df, weights_dict)
     # We will not do any df = StandardScaler().fit_transform(df),
     # as we'll use a metric based on earth distances
     if distance_matrix is None:
         # My tests indicate that passing a pre-computed distance matrix to
-        # dbscan can be up to 2.5x faster than passing a metrics function (if
-        # they both are written in pure python) to fit df. If they are both
-        # written in fortran and interfaced via f2py3, or then written in
-        # python but jit-compiled with numba, then the relative speed up
-        # can reach up to 120x.
+        # dbscan can be up to 2.5x faster than passing a metrics function to
+        # fit df (if using pure python). If pre-computing the matrix using
+        # jit-compilation via numba, then the relative speed up can reach
+        # up to 120x.
         distance_matrix = calc_distance_matrix(
-            # Drop columns that won't be used in the clustering
-            df.drop(list(skip), axis=1),
-            weights,
-            optimize_mode=distance_matrix_optimize_mode,
-            num_threads=n_jobs,
+            df=df, config=config, domain=domain, num_threads=n_jobs
         )
     # Running clustering with the computed distance matrix
@@ -265,8 +187,8 @@
     tstart = time.time()
     if method == "dbscan":
         db = DBSCAN(
-            eps=eps,
-            min_samples=min_samples,
+            eps=config.get_clustering_opt("eps"),
+            min_samples=config.get_clustering_opt("min_samples"),
             metric="precomputed",
             n_jobs=n_jobs,
         ).fit(distance_matrix)
@@ -277,19 +199,19 @@
         # For more info on the parameters, see
         # <https://hdbscan.readthedocs.io/en/latest/parameter_selection.html>
         db = HDBSCAN(
-            min_samples=min_samples,
-            min_cluster_size=min_cluster_size,
+            min_samples=config.get_clustering_opt("min_samples"),
+            min_cluster_size=config.get_clustering_opt("min_cluster_size"),
             metric="precomputed",
             core_dist_n_jobs=n_jobs,
             allow_single_cluster=True,
-            # Default cluster_selection_method: 'eom'. Sometimes it leads to
-            # clusters that are too big. Using 'leaf' seems better.
-            cluster_selection_method="leaf",
+            cluster_selection_method=config.get_clustering_opt(
+                "cluster_selection_method"
+            ),
         ).fit(distance_matrix)
     elif method == "optics":
         db = OPTICS(
-            min_samples=min_samples,
-            min_cluster_size=min_cluster_size,
+            min_samples=config.get_clustering_opt("min_samples"),
+            min_cluster_size=config.get_clustering_opt("min_cluster_size"),
             n_jobs=n_jobs,
             metric="precomputed",
         ).fit(distance_matrix)
@@ -297,13 +219,16 @@
         db = RobustSingleLinkage(
             # cut: The reachability distance value to cut the cluster
             # heirarchy at to derive a flat cluster labelling.
-            cut=eps, # default=0.4
+            # default cut for method: 0.4
+            cut=config.get_clustering_opt("eps"),
             # Reachability distances will be computed with regard to the
             # k nearest neighbors.
-            k=min_samples, # default=5
+            # default k for method: 5
+            k=config.get_clustering_opt("min_samples"),
             # Ignore any clusters in the flat clustering with size less
             # than gamma, and declare points in such clusters as noise points.
-            gamma=min_cluster_size, # default=5
+            # default gamma for method: 5
+            gamma=config.get_clustering_opt("min_cluster_size"),
             metric="precomputed",
         ).fit(distance_matrix)
     logger.debug(
@@ -316,27 +241,15 @@
     # expects 'cluster_label' to be the last column in the dataframe
     df["cluster_label"] = db.labels_
-    # Refine clustering if requested
-    # It is important to have 'cluster_label' as the last column
-    # when running the iterative refine routine
-    if outlier_rm_method:
+    if remove_outliers:
+        # Refine clustering if requested
+        # It is important to have 'cluster_label' as the last column
+        # when running the iterative refine routine
         df = filter_outliers(
             df,
+            config=config,
             db=db,
-            outlier_rm_method=outlier_rm_method,
-            # Args that apply only to LOF
             distance_matrix=distance_matrix,
-            # Args that only apply for the iterative method
-            skip=skip,
-            max_num_refine_iter=max_num_refine_iter,
-            max_n_stdev_around_mean=max_n_stdev_around_mean,
-            trunc_perc=trunc_perc,
-            weights=weights,
-            method=method,
-            weights_dict=weights_dict,
-            eps=eps,
-            min_cluster_size=min_cluster_size,
-            min_samples=min_samples,
            n_jobs=n_jobs,
            reclustering_function=self_consistent_reclustering,
        )
@@ -378,12 +291,13 @@ def suspendlogging(func):
 @suspendlogging
-def self_consistent_reclustering(df, distance_matrix, **kwargs):
+def self_consistent_reclustering(df, config, distance_matrix, **kwargs):
     """Recluster obs in df until further clustering on df returns df itself.
     Args:
         df (pandas.Dataframe): Pandas dataframe with observations and initial
             clustering info.
+        config (ParsedConfig): Parsed configs.
         distance_matrix (HollowSymmetricMatrix): Matrix of distances between
             the obs in df.
         **kwargs: Keyword args passed to run_clustering_on_df.
@@ -403,9 +317,11 @@ def self_consistent_reclustering(df, config, distance_matrix, **kwargs):
         df = df.drop(df[noise_mask].index)
         df = df.drop(["cluster_label"], axis=1)
         df_rec = run_clustering_on_df(
             df,
-            outlier_rm_method=None,
+            config=config,
+            remove_outliers=False,
             distance_matrix=distance_matrix.subspace(i_valid_obs),
             calc_silhouette_samples="silhouette_score" in df.columns,
             **kwargs,
@@ -419,7 +335,7 @@ def self_consistent_reclustering(df, config, distance_matrix, **kwargs):
     return df
-def _cluster_netatmo_obs_one_domain(df, config, **kwargs):
+def _cluster_netatmo_obs_one_domain(df, config, domain, **kwargs):
     """Cluster netatmo obs inside a given domain.
     Helper for the main routine cluster_netatmo_obs
@@ -427,6 +343,7 @@ def _cluster_netatmo_obs_one_domain(df, config, domain, **kwargs):
     Args:
         df (pandas.Dataframe): Pandas dataframe with observations.
         config (ParsedConfig): Parsed configs.
+        domain (netatmoqc.domains.Domain): The adopted spatial domain.
         **kwargs: Keyword args passed to run_clustering_on_df.
     Returns:
@@ -435,24 +352,7 @@
     """
     time_start_clustering = time.time()
     logger.debug("Performing clustering...")
-    outlier_rm_method = config.get_clustering_opt("outlier_removal.method")
-    df = run_clustering_on_df(
-        df=df,
-        method=config.general.clustering_method,
-        distance_matrix_optimize_mode=config.general.custom_metrics_optimize_mode,
-        weights_dict=config.get_clustering_opt("obs_weights"),
-        eps=config.get_clustering_opt("eps"),
-        min_cluster_size=config.get_clustering_opt("min_cluster_size"),
-        min_samples=config.get_clustering_opt("min_samples"),
-        outlier_rm_method=outlier_rm_method,
-        max_num_refine_iter=config.get_clustering_opt(
-            "outlier_removal.{}.max_n_iter".format(outlier_rm_method)
-        ),
-        max_n_stdev_around_mean=config.get_clustering_opt(
-            "outlier_removal.{}.max_n_stdev".format(outlier_rm_method)
-        ),
-        **kwargs,
-    )
+    df = run_clustering_on_df(df=df, config=config, domain=domain, **kwargs)
     time_end_clustering = time.time()
     logger.debug(
         "Done with clustering. Elapsed: %.2fs",
@@ -523,7 +423,10 @@ def cluster_netatmo_obs(df, config, **kwargs):
             domain.n_subdomains,
         )
         df_sub = _cluster_netatmo_obs_one_domain(
-            df=df_sub, config=config, **pre_clustering_kwargs
+            df=df_sub,
+            config=config,
+            domain=subdomain,
+            **pre_clustering_kwargs,
         )
         domain_split_dfs.append(df_sub)
@@ -566,7 +469,9 @@
         "DTG=%s: Main clustering over whole domain...",
         df.metadata_dict["dtg"],
     )
-    df = _cluster_netatmo_obs_one_domain(df=df, config=config, **kwargs)
+    df = _cluster_netatmo_obs_one_domain(
+        df=df, config=config, domain=domain, **kwargs
+    )
     if df_rejected is not None:
         # Put back eventual obs rejected at the pre-clustering step
...
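The comment kept in run_clustering_on_df about pre-computed distance matrices is easy to reproduce outside the package. A minimal, self-contained illustration, assuming scikit-learn and scipy are available and using made-up data and a toy metric: passing a precomputed matrix yields the same labels as passing a Python callable metric, while avoiding the per-pair Python call overhead.

```python
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(seed=0)
points = rng.normal(size=(300, 3))


def slow_euclidean(a, b):
    # Stand-in for a custom pure-python metric, called once per pair of points.
    return np.sqrt(np.sum((a - b) ** 2))


labels_callable = DBSCAN(eps=0.5, min_samples=5, metric=slow_euclidean).fit(points).labels_

# Same clustering, but the distances are computed once, up front.
distance_matrix = cdist(points, points)
labels_precomputed = (
    DBSCAN(eps=0.5, min_samples=5, metric="precomputed").fit(distance_matrix).labels_
)

print((labels_callable == labels_precomputed).all())  # expected: True
```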
@@ -327,6 +327,8 @@ with config_section("_template.clustering_method") as section:
     config_metadata.register("min_cluster_size", default=5, minval=1)
     # obs_weights not explicitly set will be internally set to 1
     config_metadata.register("obs_weights.geo_dist", default=1.0, minval=0.0)
+    config_metadata.register("obs_weights.x", default=1.0, minval=0.0)
+    config_metadata.register("obs_weights.y", default=1.0, minval=0.0)
     config_metadata.register("obs_weights.alt", default=1.0, minval=0.0)
     config_metadata.register(
         "obs_weights.temperature", default=5.0, minval=0.0
@@ -377,11 +379,9 @@ with config_section("general") as section:
     config_metadata.register("dtgs.start")
     config_metadata.register("dtgs.end")
     config_metadata.register("dtgs.cycle_length", default="3H")
-    # Metrics optimisaion scheme
+    # Data cols to ignore when running clustering
     config_metadata.register(