Commit 5a8b6aa9 authored by Paulo Medeiros

Bugfixes, changes to metrics & clustering opts.

Summary of main changes below:

Added:
    - New metrics calculation methods (see the illustrative sketch after this commit header):
        - correlation_aware_euclidean (the new default)
        - haversine_plus_euclidean
        - haversine_plus_manhattan (the only one implemented previously)
    - "unclusterable_data_columns" general config option
    - Allow choice of HDBSCAN's cluster_selection_method

Changed:
    - Default HDBSCAN cluster_selection_method from "leaf" to "eom"
    - Default min_samples and min_cluster_size: 5 --> 10
    - Changed internal data normalisation scheme
    - Metrics now has its own section in the config file
    - Use a stricter GLOSH outlier removal score threshold
    - Visualised map now uses the same projection params as those configured in the domain
    - Remove unused "tstep" from domain configs

Fixed:
    - InvalidIndexError caught after pandas 1.4.0 update
    - Some crashes in outlier removal methods (solves #5)
    - flakehell cannot import 'MergedConfigParser'
    - Some warnings
parents c615173b f148f400
Pipeline #10662 passed with stages
in 3 minutes and 28 seconds
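
For orientation only: the three metrics methods added above are just named in this summary. A minimal, hypothetical sketch of what a "haversine_plus_euclidean" pairwise distance between two observations could look like is given below. It does not reproduce netatmoqc's actual implementation; the helper names, the weighting scheme and the way the geographical and non-geographical terms are combined are all assumptions here.

import numpy as np

EARTH_RADIUS_KM = 6371.0

def haversine_km(lat1, lon1, lat2, lon2):
    # Great-circle distance between two (lat, lon) points, in km.
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    hav = (np.sin((lat2 - lat1) / 2.0) ** 2
           + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    return 2.0 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(hav))

def haversine_plus_euclidean(obs_a, obs_b, geo_weight=1.0, var_weights=None):
    # obs_a, obs_b: dicts with "lat", "lon" and other numeric variables
    # (e.g. "temperature", "pressure", "humidity").
    geo_term = geo_weight * haversine_km(obs_a["lat"], obs_a["lon"], obs_b["lat"], obs_b["lon"])
    other_vars = [k for k in obs_a if k not in ("lat", "lon")]
    if var_weights is None:
        var_weights = {k: 1.0 for k in other_vars}
    var_terms = [var_weights[k] * (obs_a[k] - obs_b[k]) for k in other_vars]
    # Combine the geographical term and the weighted variable differences Euclidean-style.
    return float(np.sqrt(geo_term ** 2 + np.sum(np.square(var_terms))))

# Example: two nearby stations with a 1.5 K temperature difference.
print(haversine_plus_euclidean(
    {"lat": 60.0, "lon": 10.0, "temperature": 285.0},
    {"lat": 60.1, "lon": 10.1, "temperature": 286.5},
))

Going by the names alone, "haversine_plus_manhattan" (the previously existing method) would presumably replace the Euclidean combination of the non-geographical differences with a sum of absolute differences, and the new default "correlation_aware_euclidean" would additionally account for correlations between the observed variables.
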
......@@ -10,49 +10,41 @@
# list of plugins and rules for them
[tool.flakehell.plugins]
# Activate all rules for all plugins by default
"*" = ["+*"]
# Remove from flake8-bandit:
# "S403": Consider possible security implications associated with pickle
# "S404": Consider possible security implications associated with subprocess
# "S603": To allow using subprocess.call/run
# "S606": To allow using os.startfile
flake8-bandit = ["+*", "-S403", "-S404", "-S603", "-S606"]
flake8-bugbear = ["+*"]
flake8-builtins = ["+*"]
# Remove C408 from flake8-comprehensions because I think sometimes the "dict" syntax
# looks cleaner than literal "{}". Dict creation performance is not an issue here.
flake8-comprehensions = ["+*", "-C408"]
flake8-darglint = ["+*"]
flake8-docstrings = ["+*", "-D105"] # Remove "D105: Missing docstring in magic method"
flake8-eradicate = ["+*"]
flake8-logging-format = ["+*"]
flake8-mutable = ["+*"]
flake8-pytest-style = ["+*"]
mccabe = ["+*"]
pep8-naming = ["+*"]
# Exclude some errors from pycodestyle for compatibility with black.
# "E501" is for max_line_length violations. Leave this for black to handle.
# For the other excluded errors, see:
# <https://black.readthedocs.io/en/stable/the_black_code_style.html#slices>
# <https://black.readthedocs.io/en/stable/the_black_code_style.html#line-breaks-binary-operators>
pycodestyle = ["+*", "-W503", "-E203", "-E501"]
pyflakes = ["+*"]
# Disable pylint plugin at the moment. pylint will be run separately.
pylint = ["-*"]
[tool.flakehell.exceptions."*/wsgi.py"]
# Ignore "F401 (imported but unused)" in this case
pyflakes = ["-F401"]
pyflakes = ["+*", "-F401"]
# W0611: Unused import
pylint = ["-W0611"]
pylint = ["+*", "-W0611"]
[tool.flakehell.exceptions."tests/*.py"]
# Disable some flake8-bandit checks in tests:
# "S101": To allow assert use
# "S301": To Allow testing pickle/unpickle
flake8-bandit = ["-S101", "-S301"]
flake8-bandit = ["+*", "-S101", "-S301"]
# Ignore "-D105" and "-D102" (Missing docstring in public class/method) in unit tests.
# The unit tests class and method names are supposed to be self-explanatory.
flake8-docstrings = ["-D105", "-D101", "-D102"]
flake8-docstrings = ["+*", "-D105", "-D101", "-D102"]
# C0102: Blacklisted name. We want to use "foo", "bar", etc. in the tests.
# C0103: Method name doesn't conform to snake_case naming style
# C0115: Missing class docstring
......@@ -61,4 +53,4 @@
# R0903: Too few public methods
# W0621: Redefining name from outer scope (false positive for fixtures)
# W0212: Access to a protected member _data of a client class
pylint = ["-C0102", "-C0103", "-C0115", "-C0116", "-R0201", "-R0903", "-W0621", "-W0212"]
pylint = ["+*", "-C0102", "-C0103", "-C0115", "-C0116", "-R0201", "-R0903", "-W0621", "-W0212"]
......@@ -18,3 +18,4 @@ poetry.lock
cluster_obs_single_dtg.html
select_stations.html
dist/
.idea/
......@@ -11,10 +11,9 @@ stages:
# Template code for dependency install
.install_deps:
script:
- curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python3
- source $HOME/.poetry/env
- ls poetry.lock || poetry lock
- poetry install -v --remove-untracked
- poetry --version || curl -sSL https://install.python-poetry.org | python3 -
- poetry update -v -n
- poetry install -v -n --remove-untracked
create_py38:
stage: "Prepare Environments"
......
default_language_version:
python: python3.8
repos:
- repo: https://github.com/PyCQA/isort
rev: 718fe45817628f8033b2b47aa9ce5a2d8c890ca7
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 5d33f20a2a2c85cfb521ae9c5f9254bfe9fc2fd9
rev: 22.1.0
hooks:
- id: black
language_version: python3.6
- repo: local
hooks:
- id: pylint
......@@ -15,7 +17,12 @@ repos:
entry: pylint
language: system
types: [python]
- repo: https://github.com/flakehell/flakehell
rev: 91a470fa2adfd0010162ee0c2b54f046c053fef7
exclude: ^tests/
- repo: local
hooks:
- id: flakehell
name: flakehell
language: python
types: [file, text]
files: \.(ipynb|md|py|rst|yaml|yml)$
entry: flakehell lint
......@@ -46,14 +46,23 @@
# Note that the interval is closed at the left and open at the right.
dtgs.cycle_length = '3H' # Default = '3H'
#
# custom_metrics_optimize_mode:
[metrics]
# method:
# > default: "correlation_aware_euclidean"
# > choices:
# "correlation_aware_euclidean"
# "haversine_plus_manhattan"
# "haversine_plus_euclidean"
method = "correlation_aware_euclidean"
# optimize_mode:
# > default: "memory"
# > choices: "memory", "speed_mem_compromise", "speed"
# In terms of memory usage:
# "memory" < "speed_mem_compromise" < "speed"
# In terms of execution time:
# "memory" > "speed_mem_compromise" > "speed"
custom_metrics_optimize_mode = "memory"
optimize_mode = "memory"
##################################
# Options controlling the domain #
......@@ -64,7 +73,6 @@
# <https://hirlam.org/trac/wiki/HarmonieSystemDocumentation/ModelDomain>
# <https://hirlam.org/trac/browser/Harmonie/scr/Harmonie_domains.pm>
name = ""
tstep = 75
nlon = 900
nlat = 960
lonc = 16.763011639
......@@ -108,8 +116,8 @@
#
# You can play with the values of the parameters below, but these defaults seem reasonable
[clustering_method.hdbscan]
min_samples = 5
min_cluster_size = 5
min_samples = 10
min_cluster_size = 10
[clustering_method.hdbscan.obs_weights]
# Set weights for the calculation of the pairwise distances between obs.
#
......@@ -134,15 +142,15 @@
# The rest of the file follows the same logic as above.
[clustering_method.dbscan]
eps = 10
min_samples = 5
min_samples = 10
[clustering_method.dbscan.obs_weights]
temperature = 5.0
[clustering_method.dbscan.outlier_removal]
method = 'iterative'
[clustering_method.optics]
min_samples = 5
min_cluster_size = 5
min_samples = 10
min_cluster_size = 10
[clustering_method.optics.obs_weights]
temperature = 5.0
[clustering_method.optics.outlier_removal]
......
#!/usr/bin/env python3
"""Common definitions."""
import numba
try:
# From python3.8
from importlib.metadata import version
......@@ -11,3 +13,8 @@ try:
__version__ = version(__name__)
except ModuleNotFoundError:
__version__ = "?"
# Set the threading layer before any parallel target compilation.
# Picking "omp" specifically to avoid warnings about old TBB versions.
# See <http://numba.pydata.org/numba-doc/latest/user/threading-layer.html>
numba.config.THREADING_LAYER = "omp"
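# --- Illustrative only; not part of this commit or of the netatmoqc package ---
# Minimal sketch of why the threading layer is set above, before anything else:
# the first function compiled with parallel=True locks the layer in, so it must
# already be "omp" at that point. "row_sums" below is a hypothetical example.
import numpy as np
from numba import njit, prange

@njit(parallel=True)
def row_sums(arr):
    out = np.empty(arr.shape[0])
    for i in prange(arr.shape[0]):
        out[i] = arr[i, :].sum()
    return out

row_sums(np.random.rand(4, 3))  # first parallel compilation; layer now fixed to "omp"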
......@@ -7,28 +7,19 @@ import time
from datetime import datetime
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import numpy as np
import pandas as pd
import redis
from dash import dash_table, dcc, html
from dash.dependencies import Input, Output, State
from flask_caching import Cache
from server import server
from netatmoqc.clustering import cluster_netatmo_obs, sort_df_by_cluster_size
from netatmoqc.config_parser import (
ParsedConfig,
UndefinedConfigValue,
read_config,
)
from netatmoqc.config_parser import ParsedConfig, UndefinedConfigValue, read_config
from netatmoqc.domains import Domain
from netatmoqc.dtgs import Dtg
from netatmoqc.load_data import (
read_netatmo_data_for_dtg,
remove_irregular_stations,
)
from netatmoqc.load_data import read_netatmo_data_for_dtg, remove_irregular_stations
from netatmoqc.logs import CustomFormatter
from netatmoqc.metrics import haversine_distance
from netatmoqc.plots import make_clustering_fig
......@@ -46,9 +37,7 @@ app = dash.Dash(
name="clustering",
server=server,
url_base_pathname="/clustering/",
meta_tags=[
{"name": "viewport", "content": "width=device-width, initial-scale=1"}
],
meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
# Fix duplicate log items
......@@ -106,13 +95,9 @@ def generate_obs_weights_panel():
# Get defaults from config file if defined. Use the ones
# defined in the calls to this function otherwise.
try:
default_from_config = config.get_clustering_opt("obs_weights", {})[
var_name
]
default_from_config = config.get_clustering_opt("obs_weights", {})[var_name]
if default_from_config is UndefinedConfigValue:
raise AttributeError(
'obs_weights not defined for "{}"'.format(var_name)
)
raise AttributeError('obs_weights not defined for "{}"'.format(var_name))
default = default_from_config
logger.debug(
'Using config file default "%s" for "%s" weight',
......@@ -189,9 +174,7 @@ def generate_control_card():
html.P("Clustering method"),
dcc.Dropdown(
id="method-select",
options=[
{"label": i, "value": i} for i in allowed_cluster_methods
],
options=[{"label": i, "value": i} for i in allowed_cluster_methods],
value=allowed_cluster_methods[0],
),
html.Br(),
......@@ -207,7 +190,7 @@ def generate_control_card():
type="number",
inputMode="numeric",
min=1,
value=5,
value=10,
step=1,
required=True,
style=dict(
......@@ -233,7 +216,7 @@ def generate_control_card():
type="number",
inputMode="numeric",
min=2,
value=5,
value=10,
step=1,
style=dict(
display="table-cell",
......@@ -263,6 +246,34 @@ def generate_control_card():
),
html.Br(),
#
html.Div(
id="metrics_method_div",
children=[
html.Br(),
html.P("Metrics Calculation Method"),
dcc.Dropdown(
id="metrics_method",
options=[
{
"label": "Correlation-Aware Euclidean",
"value": "correlation_aware_euclidean",
},
{
"label": "Haversine + Manhattan",
"value": "haversine_plus_manhattan",
},
{
"label": "Haversine + Euclidean",
"value": "haversine_plus_euclidean",
},
],
value="correlation_aware_euclidean",
),
],
style={"display": "block", "text-align": "center"},
),
html.Br(),
#
html.Div(
id="optionals_div",
children=[
......@@ -270,9 +281,7 @@ def generate_control_card():
id="outlier_rm_method_div",
children=[
html.Br(),
html.P(
"Post-Clustering Outlier Removal (Optional)"
),
html.P("Post-Clustering Outlier Removal (Optional)"),
dcc.Dropdown(
id="outlier_rm_method",
options=[{"label": "None", "value": None}]
......@@ -549,9 +558,7 @@ def read_data_df(str_date, cycle):
@app.callback(
[
Output(component_id="eps_div", component_property="style"),
Output(
component_id="min_cluster_size_div", component_property="style"
),
Output(component_id="min_cluster_size_div", component_property="style"),
],
[Input(component_id="method-select", component_property="value")],
)
......@@ -604,6 +611,7 @@ def show_hide_max_num_refining_iter(outlier_rm_method):
State("eps", "value"),
State("date-picker-select", "date"),
State("cycle-select", "value"),
State("metrics_method", "value"),
State("outlier_rm_method", "value"),
State("max_num_refine_iter", "value"),
State("max_n_std_around_mean", "value"),
......@@ -624,6 +632,7 @@ def run_clustering_and_make_plot(
eps,
date,
cycle,
metrics_method,
outlier_rm_method,
max_num_refining_iter,
refine_max_std,
......@@ -646,9 +655,7 @@ def run_clustering_and_make_plot(
start_read_data = time.time()
df = read_data_df(date, cycle)
end_read_data = time.time()
logger.info(
"Done reading data. Elapsed: %.1fs", end_read_data - start_read_data
)
logger.info("Done reading data. Elapsed: %.1fs", end_read_data - start_read_data)
n_obs = len(df.index)
if n_obs == 0:
......@@ -660,12 +667,15 @@ def run_clustering_and_make_plot(
{
"general": dict(
clustering_method=method,
custom_metrics_optimize_mode=config.general.custom_metrics_optimize_mode,
dtgs=dict(
list=config.general.dtgs,
cycle_length=config.general.dtgs.cycle_length.freqstr,
),
),
"metrics": dict(
method=metrics_method,
optimize_mode=config.metrics.optimize_mode,
),
"clustering_method.%s"
% (method): dict(
eps=eps,
......@@ -751,22 +761,14 @@ def run_clustering_and_make_plot(
data,
dcc.Markdown("**{:.2f}**".format(silhouette_score)),
dcc.Markdown("**{}**".format(n_clusters)),
dcc.Markdown("**{} ({:.2f}%)**".format(n_accepted, 100.0 * n_accepted / n_obs)),
dcc.Markdown(
"**{} ({:.2f}%)**".format(n_accepted, 100.0 * n_accepted / n_obs)
"**{} ({:.2f}%)**".format(n_rm_clustering, 100.0 * n_rm_clustering / n_obs)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(
n_rm_clustering, 100.0 * n_rm_clustering / n_obs
)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(
n_rm_refining, 100.0 * n_rm_refining / n_obs
)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(noise_count, 100.0 * noise_count / n_obs)
"**{} ({:.2f}%)**".format(n_rm_refining, 100.0 * n_rm_refining / n_obs)
),
dcc.Markdown("**{} ({:.2f}%)**".format(noise_count, 100.0 * noise_count / n_obs)),
)
......@@ -776,9 +778,7 @@ def run_clustering_and_make_plot(
[
# Set clickmode='event+select' in the figure layout, and then
# use 'selectedData' here instead of 'clickData'
Input(
component_id="clustering_plot", component_property="selectedData"
),
Input(component_id="clustering_plot", component_property="selectedData"),
],
)
def geodist_upon_pt_pair_selection(selected_data):
......
......@@ -5,20 +5,16 @@ import os
from datetime import datetime as dt
import dash
import dash_core_components as dcc
import dash_html_components as html
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dash import dcc, html
from dash.dependencies import Input, Output, State
from server import server
from netatmoqc.config_parser import read_config
from netatmoqc.domains import Domain
from netatmoqc.load_data import (
read_netatmo_data_for_dtg,
remove_irregular_stations,
)
from netatmoqc.load_data import read_netatmo_data_for_dtg, remove_irregular_stations
from netatmoqc.logs import CustomFormatter
from netatmoqc.plots import generate_single_frame, init_fig_dict
......@@ -34,9 +30,7 @@ app = dash.Dash(
name="scattergeo_timeseries",
server=server,
url_base_pathname="/scattergeo_timeseries/",
meta_tags=[
{"name": "viewport", "content": "width=device-width, initial-scale=1"}
],
meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
......@@ -92,9 +86,7 @@ def generate_control_card():
html.Br(),
html.Div(
id="plot-btn-outer",
children=html.Button(
id="plot-btn", children="Make Plot", n_clicks=0
),
children=html.Button(id="plot-btn", children="Make Plot", n_clicks=0),
),
],
)
......@@ -149,9 +141,7 @@ def prepare_animation(n_clicks, start_date, end_date, dataset_var):
if n_clicks == 0:
return domain.get_fig(max_ngrid=0)
fig_dict, sliders_dict = init_fig_dict(
domain, dataset_var, frame_duration=300
)
fig_dict, sliders_dict = init_fig_dict(domain, dataset_var, frame_duration=300)
# Determine map boundaries and max/min plotted values
minval_dataset_var = float("inf")
......@@ -172,9 +162,7 @@ def prepare_animation(n_clicks, start_date, end_date, dataset_var):
for idtg, dtg in enumerate(pd.date_range(start_date, end_date, freq="3H")):
logger.info("Reading data for %s", dtg)
df = read_netatmo_data_for_dtg(
dtg, rootdir=config.general.data_rootdir
)
df = read_netatmo_data_for_dtg(dtg, rootdir=config.general.data_rootdir)
df, _ = remove_irregular_stations(df)
logger.debug(" * Done. Now adding frame.")
......
......@@ -16,7 +16,7 @@ from .commands_functions import (
)
class StoreDictKeyPair(argparse.Action):
class StoreDictKeyPair(argparse.Action): # pylint: disable=too-few-public-methods
"""Enable args="key1=val1, ..., keyN=valN" in command line args."""
# Source: <https://stackoverflow.com/questions/29986185/
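# --- Illustrative only; the StoreDictKeyPair body is collapsed in this diff ---
# A minimal, hypothetical action of this kind splits "key1=val1, ..., keyN=valN"
# on "," and "=" and stores the resulting dict on the namespace. The names below
# (ExampleStoreDictKeyPair, --opts) are examples, not netatmoqc's actual ones.
import argparse

class ExampleStoreDictKeyPair(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        pairs = (item.split("=", 1) for item in values.split(","))
        setattr(namespace, self.dest, {key.strip(): val.strip() for key, val in pairs})

example_parser = argparse.ArgumentParser()
example_parser.add_argument("--opts", action=ExampleStoreDictKeyPair)
print(example_parser.parse_args(["--opts", "eps=10, min_samples=5"]).opts)
# -> {'eps': '10', 'min_samples': '5'}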
......@@ -56,9 +56,7 @@ def get_parsed_args(program_name):
fpath = Path(os.getenv("NETATMOQC_CONFIG_PATH", "config.toml"))
default_conf_path = fpath.resolve(strict=True)
except FileNotFoundError:
default_conf_path = (
Path(os.getenv("HOME")) / ".netatmoqc" / "config.toml"
)
default_conf_path = Path(os.getenv("HOME")) / ".netatmoqc" / "config.toml"
parser.add_argument(
"--version", "-v", action="version", version="%(prog)s v" + __version__
)
......
......@@ -70,8 +70,7 @@ def sort_df_by_cluster_size(df):
# mess up the sorting performed above.
unique_labels = df[original_cluster_label_col].unique()
_labels_old2new = {
old: new
for new, old in enumerate(lab for lab in unique_labels if lab >= 0)
old: new for new, old in enumerate(lab for lab in unique_labels if lab >= 0)
}
@np.vectorize
......@@ -89,51 +88,6 @@ def sort_df_by_cluster_size(df):
return df.drop("parent_cluster_size", axis=1).reset_index(drop=True)
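# Illustrative note, not part of this commit: the "_labels_old2new" map built in
# sort_df_by_cluster_size above renames the non-noise cluster labels, in their
# order of appearance after the sorting, to 0, 1, 2, ..., leaving the noise
# label (-1) out and therefore unchanged. With hypothetical labels, for instance:
#     unique_labels = [7, -1, 3, 0]
#     {old: new for new, old in enumerate(lab for lab in unique_labels if lab >= 0)}
#     # -> {7: 0, 3: 1, 0: 2}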
def weights_dict_to_np_array(
df, pairwise_diff_weights=None, skip=("id", "time_utc"), default=1
):
"""Convert pairwise_diff_weights into a numpy array.
Takes a pandas dataframe and a {column_name:weight} dictionary and returns
an array of weights to be passed to the routine that calculates the
distance matrix.
Columns "lat" and "lon" in df are treated specially, in that they are
not assigned a weight individually, but rather a single weight gets
assigned to the "geo_dist" property.
Args:
df (pandas.Dataframe): Dataframe with observations.
pairwise_diff_weights (dict): {df_column_name:weight} dictionary.
Default value = None.
skip: df columns that will not enter the clustering and should
therefore be skipped. Default value = ("id", "time_utc")
default: Default weight to be assigned for a non-skipped df column if
the column name is present in df but not in pairwise_diff_weights.
Default value = 1.
Returns:
numpy.ndarray: weights to be passed to the routine that calculates the
distance matrix.
Raises:
ValueError: If the dataframe 'lat' column is not followed by the 'lon'
column.
"""
if df.columns.get_loc("lon") - df.columns.get_loc("lat") != 1:
raise ValueError("'lat' column is not followed by 'lon' column")
weights = []
col2weight = {c: ("geo_dist" if c == "lon" else c) for c in df.columns}
for col in df.columns[~df.columns.isin(list(skip) + ["lat"])]:
try:
weights.append(pairwise_diff_weights[col2weight[col]])
except (KeyError, TypeError):
weights.append(default)
return np.array(weights, dtype=np.float64)
def get_silhouette_samples(df, distance_matrix):
"""Calculate silhouette scores for every obs in df.
......@@ -177,50 +131,27 @@ def get_silhouette_samples(df, distance_matrix):
def run_clustering_on_df(
df,
method="hdbscan",
config,
domain,
distance_matrix=None,
distance_matrix_optimize_mode="memory",
skip=("id", "time_utc"),
weights_dict=None,
eps=15, # eps applies only to dbscan
min_cluster_size=3, # min_cluster_size applies only to hdbscan
min_samples=3,
n_jobs=-1,
outlier_rm_method=None,
max_num_refine_iter=50,
max_n_stdev_around_mean=2.0,
trunc_perc=0.25,
remove_outliers=True,
calc_silhouette_samples=True,
n_jobs=-1,
):
"""Low-level clustering routine.
Args:
df (pandas.Dataframe): Dataframe with observations.
method (str): {"hdbscan", "dbscan", "rsl", "optics"}
Clustering method.
config (netatmoqc.config.ParsedConfig): Program's general configs.
domain (netatmoqc.domains.Domain): The adopted spatial domain.
distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
Default value = None.
distance_matrix_optimize_mode (str): The distance matrix optimization
mode. Default value = "memory".
skip (tuple): Columns to skip in df.
Default value = ("id", "time_utc").
weights_dict (dict): obs_name --> obs_weight map. Default value = None.
eps (float): DBSCAN's eps parameter. Default value = 15.0.
min_cluster_size (int): HDBSCAN's min_cluster_size. Default value = 3.
min_samples (int): (H)DBSCAN's min_samples. Default value = 3.
n_jobs (int): Max number of local-host parallel jobs.
Default value = -1.
outlier_rm_method (str): Outlier removal method. Default value = None.
max_num_refine_iter (int): Max number of iterations for the
"iterative" outlier removal method. Default value = 50.
max_n_stdev_around_mean (float): Max number of stdev from cluster obs
mean for an observation to be considered OK in the "iterative"
outlier removal method. Default value = 2.0.
trunc_perc (float): Percentage used in array truncation when
calculating stdevs and means in the "iterative" outlier removal
method. Default value = 0.25.
remove_outliers (bool): Use a post-clustering outlier removal method?