Commit 8029cbbe authored by Paulo Medeiros

Tweaks to clustering/outlier removal params. Fixes

Summary of main changes below:

Changed:
    - Default HDBSCAN method from "leaf" to "eom" (see the sketch after this list)
    - Default min_samples and min_cluster_size: 5 --> 10
    - Internal data normalisation scheme
    - Use a stricter GLOSH outlier-removal score threshold
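
A minimal sketch of how these new defaults map onto the hdbscan package's API, including a GLOSH-based outlier cut. The helper name and the 0.90 score quantile below are illustrative assumptions, not values taken from this commit:

    # Hedged illustration only -- not this repository's actual code.
    import hdbscan
    import numpy as np

    def cluster_with_glosh_cut(features, glosh_quantile=0.90):
        """Cluster points and flag GLOSH outliers above a score quantile."""
        clusterer = hdbscan.HDBSCAN(
            min_samples=10,                  # new default (was 5)
            min_cluster_size=10,             # new default (was 5)
            cluster_selection_method="eom",  # new default (was "leaf")
        )
        labels = clusterer.fit_predict(features)
        # GLOSH outlier scores: larger means more outlier-like, so a lower
        # score-quantile threshold is stricter and rejects more observations.
        threshold = np.nanquantile(clusterer.outlier_scores_, glosh_quantile)
        labels[clusterer.outlier_scores_ > threshold] = -1  # mark as noise
        return labels

Whether netatmoqc applies the cut exactly this way is not determined by this diff; the [clustering_method.hdbscan] hunk further down shows the actual new defaults.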

Fixed:
    - flakehell cannot import 'MergedConfigParser'
    - InvalidIndexError caught after the pandas 1.4.0 update (see the hedged sketch below)
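
The hunk implementing the InvalidIndexError fix is not among those shown below. Purely as a hedged sketch of the catching pattern -- the helper, frame names and fallback are hypothetical, not netatmoqc code:

    # Hypothetical illustration of catching pandas.errors.InvalidIndexError.
    import logging

    import pandas as pd

    logger = logging.getLogger(__name__)

    def concat_columns(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
        """Column-wise concat tolerating non-unique indices under pandas >= 1.4."""
        try:
            return pd.concat([left, right], axis=1)
        except pd.errors.InvalidIndexError:
            # Fall back to positional alignment, for this illustration only.
            logger.warning("Non-unique index; falling back to positional alignment")
            return pd.concat(
                [left.reset_index(drop=True), right.reset_index(drop=True)], axis=1
            )
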
parents c3a3730a 1fc97669
......@@ -10,49 +10,41 @@
# list of plugins and rules for them
[tool.flakehell.plugins]
# Activate all rules for all plugins by default
"*" = ["+*"]
# Remove from flake8-bandit:
# "S403": Consider possible security implications associated with pickle
# "S404": Consider possible security implications associated with subprocess
# "S603": To allow using subprocess.call/run
# "S606": To allow using os.startfile
flake8-bandit = ["+*", "-S403", "-S404", "-S603", "-S606"]
flake8-bugbear = ["+*"]
flake8-builtins = ["+*"]
# Remove C408 from flake8-comprehensions because I think sometimes the "dict" syntax
# looks cleaner than literal "{}". Dict creation performance is not an issue here.
flake8-comprehensions = ["+*", "-C408"]
flake8-darglint = ["+*"]
flake8-docstrings = ["+*", "-D105"] # Remove "D105: Missing docstring in magic method"
flake8-eradicate = ["+*"]
flake8-logging-format = ["+*"]
flake8-mutable = ["+*"]
flake8-pytest-style = ["+*"]
mccabe = ["+*"]
pep8-naming = ["+*"]
# Exclude some errors from pycodestyle for compatibility with black.
# "E501" is for max_line_length violations. Leave this for black to handle.
# For the other excluded errors, see:
# <https://black.readthedocs.io/en/stable/the_black_code_style.html#slices>
# <https://black.readthedocs.io/en/stable/the_black_code_style.html#line-breaks-binary-operators>
pycodestyle = ["+*", "-W503", "-E203", "-E501"]
pyflakes = ["+*"]
# Disable pylint plugin at the moment. pylint will be run separately.
pylint = ["-*"]
[tool.flakehell.exceptions."*/wsgi.py"]
# Ignore "F401 (imported but unused)" in this case
pyflakes = ["-F401"]
pyflakes = ["+*", "-F401"]
# W0611: Unused import
pylint = ["-W0611"]
pylint = ["+*", "-W0611"]
[tool.flakehell.exceptions."tests/*.py"]
# Disable some flake8-bandit checks in tests:
# "S101": To allow assert use
# "S301": To Allow testing pickle/unpickle
flake8-bandit = ["-S101", "-S301"]
flake8-bandit = ["+*", "-S101", "-S301"]
# Ignore "-D105" and "-D102" (Missing docstring in public class/method) in unit tests.
# The unit tests class and method names are supposed to be self-explanatory.
flake8-docstrings = ["-D105", "-D101", "-D102"]
flake8-docstrings = ["+*", "-D105", "-D101", "-D102"]
# C0102: Black listed name. We want to use "foo", "bar", etc in the tests.
# C0103: Method name doesn't conform to snake_case naming style
# C0115: Missing class docstring
......@@ -61,4 +53,4 @@
# R0903: Too few public methods
# W0621: Redefining name from outer scope (false positive for fixtures)
# W0212: Access to a protected member _data of a client class
pylint = ["-C0102", "-C0103", "-C0115", "-C0116", "-R0201", "-R0903", "-W0621", "-W0212"]
pylint = ["+*", "-C0102", "-C0103", "-C0115", "-C0116", "-R0201", "-R0903", "-W0621", "-W0212"]
repos:
- repo: https://github.com/PyCQA/isort
rev: 718fe45817628f8033b2b47aa9ce5a2d8c890ca7
rev: 5.10.1
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 5d33f20a2a2c85cfb521ae9c5f9254bfe9fc2fd9
rev: 21.10b0
hooks:
- id: black
language_version: python3.6
language_version: python3.8
- repo: local
hooks:
- id: pylint
......@@ -15,7 +15,12 @@ repos:
entry: pylint
language: system
types: [python]
- repo: https://github.com/flakehell/flakehell
rev: 91a470fa2adfd0010162ee0c2b54f046c053fef7
exclude: ^tests/
- repo: local
hooks:
- id: flakehell
name: flakehell
language: python
types: [file, text]
files: \.(ipynb|md|py|rst|yaml|yml)$
entry: flakehell lint
......@@ -50,7 +50,10 @@
[metrics]
# method:
# > default: "correlation_aware_euclidean"
# > choices: "correlation_aware_euclidean", "haversine_plus_manhattan"
# > choices:
# "correlation_aware_euclidean"
# "haversine_plus_manhattan"
# "haversine_plus_euclidean"
method = "correlation_aware_euclidean"
# optimize_mode:
# > default: "memory"
......@@ -113,8 +116,8 @@
#
# You can play with the values of the parameters below, but these defaults seem reasonable
[clustering_method.hdbscan]
min_samples = 5
min_cluster_size = 5
min_samples = 10
min_cluster_size = 10
[clustering_method.hdbscan.obs_weights]
# Set weights for the calculation of the pairwise distances between obs.
#
......@@ -139,15 +142,15 @@
# The rest of the file follows the same logic as above.
[clustering_method.dbscan]
eps = 10
min_samples = 5
min_samples = 10
[clustering_method.dbscan.obs_weights]
temperature = 5.0
[clustering_method.dbscan.outlier_removal]
method = 'iterative'
[clustering_method.optics]
min_samples = 5
min_cluster_size = 5
min_samples = 10
min_cluster_size = 10
[clustering_method.optics.obs_weights]
temperature = 5.0
[clustering_method.optics.outlier_removal]
......
......@@ -18,17 +18,10 @@ from flask_caching import Cache
from server import server
from netatmoqc.clustering import cluster_netatmo_obs, sort_df_by_cluster_size
from netatmoqc.config_parser import (
ParsedConfig,
UndefinedConfigValue,
read_config,
)
from netatmoqc.config_parser import ParsedConfig, UndefinedConfigValue, read_config
from netatmoqc.domains import Domain
from netatmoqc.dtgs import Dtg
from netatmoqc.load_data import (
read_netatmo_data_for_dtg,
remove_irregular_stations,
)
from netatmoqc.load_data import read_netatmo_data_for_dtg, remove_irregular_stations
from netatmoqc.logs import CustomFormatter
from netatmoqc.metrics import haversine_distance
from netatmoqc.plots import make_clustering_fig
......@@ -46,9 +39,7 @@ app = dash.Dash(
name="clustering",
server=server,
url_base_pathname="/clustering/",
meta_tags=[
{"name": "viewport", "content": "width=device-width, initial-scale=1"}
],
meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
# Fix duplicate log items
......@@ -106,13 +97,9 @@ def generate_obs_weights_panel():
# Get defaults from config file if defined. Use the ones
# defined in the calls to this function otherwise.
try:
default_from_config = config.get_clustering_opt("obs_weights", {})[
var_name
]
default_from_config = config.get_clustering_opt("obs_weights", {})[var_name]
if default_from_config is UndefinedConfigValue:
raise AttributeError(
'obs_weights not defined for "{}"'.format(var_name)
)
raise AttributeError('obs_weights not defined for "{}"'.format(var_name))
default = default_from_config
logger.debug(
'Using config file default "%s" for "%s" weight',
......@@ -189,9 +176,7 @@ def generate_control_card():
html.P("Clustering method"),
dcc.Dropdown(
id="method-select",
options=[
{"label": i, "value": i} for i in allowed_cluster_methods
],
options=[{"label": i, "value": i} for i in allowed_cluster_methods],
value=allowed_cluster_methods[0],
),
html.Br(),
......@@ -207,7 +192,7 @@ def generate_control_card():
type="number",
inputMode="numeric",
min=1,
value=5,
value=10,
step=1,
required=True,
style=dict(
......@@ -233,7 +218,7 @@ def generate_control_card():
type="number",
inputMode="numeric",
min=2,
value=5,
value=10,
step=1,
style=dict(
display="table-cell",
......@@ -298,9 +283,7 @@ def generate_control_card():
id="outlier_rm_method_div",
children=[
html.Br(),
html.P(
"Post-Clustering Outlier Removal (Optional)"
),
html.P("Post-Clustering Outlier Removal (Optional)"),
dcc.Dropdown(
id="outlier_rm_method",
options=[{"label": "None", "value": None}]
......@@ -577,9 +560,7 @@ def read_data_df(str_date, cycle):
@app.callback(
[
Output(component_id="eps_div", component_property="style"),
Output(
component_id="min_cluster_size_div", component_property="style"
),
Output(component_id="min_cluster_size_div", component_property="style"),
],
[Input(component_id="method-select", component_property="value")],
)
......@@ -676,9 +657,7 @@ def run_clustering_and_make_plot(
start_read_data = time.time()
df = read_data_df(date, cycle)
end_read_data = time.time()
logger.info(
"Done reading data. Elapsed: %.1fs", end_read_data - start_read_data
)
logger.info("Done reading data. Elapsed: %.1fs", end_read_data - start_read_data)
n_obs = len(df.index)
if n_obs == 0:
......@@ -784,22 +763,14 @@ def run_clustering_and_make_plot(
data,
dcc.Markdown("**{:.2f}**".format(silhouette_score)),
dcc.Markdown("**{}**".format(n_clusters)),
dcc.Markdown("**{} ({:.2f}%)**".format(n_accepted, 100.0 * n_accepted / n_obs)),
dcc.Markdown(
"**{} ({:.2f}%)**".format(n_accepted, 100.0 * n_accepted / n_obs)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(
n_rm_clustering, 100.0 * n_rm_clustering / n_obs
)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(
n_rm_refining, 100.0 * n_rm_refining / n_obs
)
"**{} ({:.2f}%)**".format(n_rm_clustering, 100.0 * n_rm_clustering / n_obs)
),
dcc.Markdown(
"**{} ({:.2f}%)**".format(noise_count, 100.0 * noise_count / n_obs)
"**{} ({:.2f}%)**".format(n_rm_refining, 100.0 * n_rm_refining / n_obs)
),
dcc.Markdown("**{} ({:.2f}%)**".format(noise_count, 100.0 * noise_count / n_obs)),
)
......@@ -809,9 +780,7 @@ def run_clustering_and_make_plot(
[
# Set clickmode='event+select' in the figure layout, and then
# use 'selectedData' here instead of 'clickData'
Input(
component_id="clustering_plot", component_property="selectedData"
),
Input(component_id="clustering_plot", component_property="selectedData"),
],
)
def geodist_upon_pt_pair_selection(selected_data):
......
......@@ -15,10 +15,7 @@ from server import server
from netatmoqc.config_parser import read_config
from netatmoqc.domains import Domain
from netatmoqc.load_data import (
read_netatmo_data_for_dtg,
remove_irregular_stations,
)
from netatmoqc.load_data import read_netatmo_data_for_dtg, remove_irregular_stations
from netatmoqc.logs import CustomFormatter
from netatmoqc.plots import generate_single_frame, init_fig_dict
......@@ -34,9 +31,7 @@ app = dash.Dash(
name="scattergeo_timeseries",
server=server,
url_base_pathname="/scattergeo_timeseries/",
meta_tags=[
{"name": "viewport", "content": "width=device-width, initial-scale=1"}
],
meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
......@@ -92,9 +87,7 @@ def generate_control_card():
html.Br(),
html.Div(
id="plot-btn-outer",
children=html.Button(
id="plot-btn", children="Make Plot", n_clicks=0
),
children=html.Button(id="plot-btn", children="Make Plot", n_clicks=0),
),
],
)
......@@ -149,9 +142,7 @@ def prepare_animation(n_clicks, start_date, end_date, dataset_var):
if n_clicks == 0:
return domain.get_fig(max_ngrid=0)
fig_dict, sliders_dict = init_fig_dict(
domain, dataset_var, frame_duration=300
)
fig_dict, sliders_dict = init_fig_dict(domain, dataset_var, frame_duration=300)
# Determine map boundaries and max/min plotted values
minval_dataset_var = float("inf")
......@@ -172,9 +163,7 @@ def prepare_animation(n_clicks, start_date, end_date, dataset_var):
for idtg, dtg in enumerate(pd.date_range(start_date, end_date, freq="3H")):
logger.info("Reading data for %s", dtg)
df = read_netatmo_data_for_dtg(
dtg, rootdir=config.general.data_rootdir
)
df = read_netatmo_data_for_dtg(dtg, rootdir=config.general.data_rootdir)
df, _ = remove_irregular_stations(df)
logger.debug(" * Done. Now adding frame.")
......
......@@ -16,7 +16,7 @@ from .commands_functions import (
)
class StoreDictKeyPair(argparse.Action):
class StoreDictKeyPair(argparse.Action): # pylint: disable=too-few-public-methods
"""Enable args="key1=val1, ..., keyN=valN" in command line args."""
# Source: <https://stackoverflow.com/questions/29986185/
......@@ -56,9 +56,7 @@ def get_parsed_args(program_name):
fpath = Path(os.getenv("NETATMOQC_CONFIG_PATH", "config.toml"))
default_conf_path = fpath.resolve(strict=True)
except FileNotFoundError:
default_conf_path = (
Path(os.getenv("HOME")) / ".netatmoqc" / "config.toml"
)
default_conf_path = Path(os.getenv("HOME")) / ".netatmoqc" / "config.toml"
parser.add_argument(
"--version", "-v", action="version", version="%(prog)s v" + __version__
)
......
......@@ -70,8 +70,7 @@ def sort_df_by_cluster_size(df):
# mess up the sorting performed above.
unique_labels = df[original_cluster_label_col].unique()
_labels_old2new = {
old: new
for new, old in enumerate(lab for lab in unique_labels if lab >= 0)
old: new for new, old in enumerate(lab for lab in unique_labels if lab >= 0)
}
@np.vectorize
......@@ -164,7 +163,7 @@ def run_clustering_on_df(
method = config.general.clustering_method.lower()
# Compute clustering using DBSCAN or HDBSCAN
if method not in ["dbscan", "hdbscan", "rsl", "optics"]:
raise NotImplementedError('Method "{}" not available.'.format(method))
raise NotImplementedError(f'Method "{method}" not available.')
if len(df.index) == 0:
logger.warning("Dataframe has no rows")
df["cluster_label"] = None
......@@ -455,9 +454,7 @@ def cluster_netatmo_obs(df, config, **kwargs):
# Reset df: Only accepted obs will be passed on to the whole-domain
# clustering. Rejections will be added again to df after that.
df_rejected = df_rejoined_split[
df_rejoined_split["cluster_label"] < 0
].copy()
df_rejected = df_rejoined_split[df_rejoined_split["cluster_label"] < 0].copy()
cols_to_drop = [c for c in df_rejected.columns if c not in df.columns]
df = df_rejoined_split
df = df[~df["id"].isin(df_rejected["id"])].drop(cols_to_drop, axis=1)
......@@ -469,16 +466,12 @@ def cluster_netatmo_obs(df, config, **kwargs):
"DTG=%s: Main clustering over whole domain...",
df.metadata_dict["dtg"],
)
df = _cluster_netatmo_obs_one_domain(
df=df, config=config, domain=domain, **kwargs
)
df = _cluster_netatmo_obs_one_domain(df=df, config=config, domain=domain, **kwargs)
if df_rejected is not None:
        # Put back any obs rejected at the pre-clustering step
if "original_cluster_label" in df.columns:
df_rejected["original_cluster_label"] = df_rejected[
"cluster_label"
].copy()
df_rejected["original_cluster_label"] = df_rejected["cluster_label"].copy()
df = pd.concat([df, df_rejected], ignore_index=True)
# Now we're done.
......
......@@ -70,20 +70,15 @@ def cluster_obs_single_dtg(args):
if args.savefig:
# Create outdir at the beginning so users don't
# waste time in case they can't save results
outdir = config.general.outdir / "{}_netatmoqc_cluster".format(
datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
)
now_as_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
outdir = config.general.outdir / f"{now_as_str}_netatmoqc_cluster"
        # Let mkdir raise an exception if outdir cannot be created (e.g. no write access)
outdir.mkdir(parents=True)
try:
df = read_netatmo_data_for_dtg(
dtg, rootdir=config.general.data_rootdir
)
df = read_netatmo_data_for_dtg(dtg, rootdir=config.general.data_rootdir)
except DataNotFoundError:
logger.warning(
"Could not cluster obs for dtg=%s: ", dtg, exc_info=True
)
logger.warning("Could not cluster obs for dtg=%s: ", dtg, exc_info=True)
return
df, _ = remove_irregular_stations(df)
......@@ -102,9 +97,7 @@ def cluster_obs_single_dtg(args):
if args.show:
fig.show(config=DEF_FIGSHOW_CONFIG)
logger.info(
"%sDone with 'cluster' command.%s", logcolor.cyan, logcolor.reset
)
logger.info("%sDone with 'cluster' command.%s", logcolor.cyan, logcolor.reset)
########################################
......@@ -119,7 +112,11 @@ def _select_stations_single_dtg(dtg, config, args):
args (argparse.Namespace): Parsed command line arguments.
Returns:
(DataFrame, DataFrame, DataFrame): df_accepted, df_rejected, df_moving
tuple containing:
pandas.DataFrame: Accepted stations.
pandas.DataFrame: Rejected stations.
pandas.DataFrame: Moving stations.
"""
tstart = time.time()
......@@ -135,9 +132,7 @@ def _select_stations_single_dtg(dtg, config, args):
cpu_share = multiprocessing.cpu_count() // proc_family_size
try:
df = read_netatmo_data_for_dtg(
dtg, rootdir=config.general.data_rootdir
)
df = read_netatmo_data_for_dtg(dtg, rootdir=config.general.data_rootdir)
except DataNotFoundError:
logger.warning("Could not select obs for dtg=%s: ", dtg, exc_info=True)
return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
......@@ -177,9 +172,9 @@ def select_stations(args):
# Create outdir at the beginning so users don't
# waste time in case they can't save results
outdir = config.general.outdir / "{}_netatmoqc_select".format(
datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
)
now_as_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
outdir = config.general.outdir / f"{now_as_str}_netatmoqc_select"
    # Let mkdir raise an exception if outdir cannot be created (e.g. no write access)
outdir.mkdir(parents=True)
......@@ -250,17 +245,12 @@ def select_stations(args):
# Using a rescued_rejected_stats list to modify the dataframes
# df_accepted/df_rejected is much faster than modifying
# the dataframes inside the loop above
rescued_rows = df_rejected.loc[
df_rejected["id"].isin(rescued_rejected_stats), :
]
rescued_rows = df_rejected.loc[df_rejected["id"].isin(rescued_rejected_stats), :]
df_accepted = df_accepted.append(rescued_rows, ignore_index=True)
df_rejected = df_rejected.drop(rescued_rows.index)
if len(rescued_rejected_stats) > 0:
logger.info(
(
" > Rescuing %d stations rejected in "
"less than %.1f%% of occurences"
),
(" > Rescuing %d stations rejected in " "less than %.1f%% of occurences"),
len(rescued_rejected_stats),
100 * config.commands.select.station_rejection_tol,
)
......@@ -316,9 +306,7 @@ def select_stations(args):
coarse_grid_factor = config.domain.thinning_grid_coarse_factor
if coarse_grid_factor > 0:
domain = Domain.construct_from_dict(config.domain)
logger.info(
"Thinning accepted obs: Keep only 1 station per support grid point"
)
logger.info("Thinning accepted obs: Keep only 1 station per support grid point")
logger.info(
" > Support grid spacing: %.1f m (%d x the domain's)",
domain.thinning_grid.x_spacing,
......@@ -345,9 +333,7 @@ def select_stations(args):
# (c) Thin data keeping only the first entry found at each (i, j).
    # As we've sorted by rejection rate (lower to higher), only the
    # lowest-rejection-rate station at each grid (i, j) will be kept.
grid_trimmed_stations = domain.thinning_grid.thin_obs(
df_accepted, method="first"
)
grid_trimmed_stations = domain.thinning_grid.thin_obs(df_accepted, method="first")
# (d) Finally, move to df_rejected those stations that appear in
# df_accepted but not in grid_trimmed_stations
......@@ -457,9 +443,9 @@ def csv2obsoul(args):
# Create outdir at the beginning so users don't
# waste time in case they can't save results
outdir = config.general.outdir / "{}_netatmoqc_csv2obsoul".format(
datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
)
now_as_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
outdir = config.general.outdir / f"{now_as_str}_netatmoqc_csv2obsoul"
    # Let mkdir raise an exception if outdir cannot be created (e.g. no write access)
outdir.mkdir(parents=True)
......@@ -476,7 +462,7 @@ def csv2obsoul(args):
else:
raise NotImplementedError(
'Only csv files supported in "--selected-stations-fpath". '
"Received '%s'." % (args.selected_stations_fpath),
f"Received '{args.selected_stations_fpath}'."
)
netatmoqc_input2output(
......@@ -495,9 +481,7 @@ def csv2obsoul(args):
obsoul_export_params=config.general.obsoul_export_params,
)
logger.info(
"%sDone with 'csv2obsoul' command.%s", logcolor.cyan, logcolor.reset
)
logger.info("%sDone with 'csv2obsoul' command.%s", logcolor.cyan, logcolor.reset)
######################################
......@@ -516,9 +500,8 @@ def thin_data_from_csv_files(args):
domain = Domain.construct_from_dict(config.domain)
outdir_prefix = config.general.outdir / "{}_netatmoqc_thin".format(
datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
)
now_as_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
outdir_prefix = config.general.outdir / f"{now_as_str}_netatmoqc_thin"
# Parse input paths. Keep file paths as they are, and find csv files
# recursively for paths that are directories.
......@@ -531,9 +514,7 @@ def thin_data_from_csv_files(args):
for fpath in file_list:
if fpath.suffix != ".csv":
logger.warning(
"Only csv files supported. Skipping file '%s'", fpath
)
logger.warning("Only csv files supported. Skipping file '%s'", fpath)
continue
logger.info("Parsing data from file %s", fpath)
......@@ -564,7 +545,7 @@ def thin_data_from_csv_files(args):
######################################
def _open_file_with_default_app(fpath):
if platform.system() == "Windows":
os.startfile(fpath)
os.startfile(fpath) # pylint: disable=no-member
elif platform.system() == "Darwin":
subprocess.call(("open", fpath))
else:
......@@ -608,9 +589,7 @@ def show(args):
logger.info("Openning file '%s'", fpath)
_open_file_with_default_app(fpath)
else:
logger.warning(
"Only html and csv files supported. Skipping file '%s'", fpath
)
logger.warning("Only html and csv files supported. Skipping file '%s'", fpath)
if len(dataframes) > 0:
fig = show_cmd_get_fig_from_dataframes(args, dataframes, domain)
......
......@@ -65,9 +65,7 @@ class UndefinedValueType:
if self._return_self_on_attr_error:
return self
raise AttributeError(
"'{}' object has no attribute '{}'".format(
self.__class__.__name__, item
)
f"'{self.__class__.__name__}' object has no attribute '{item}'"
)
def __copy__(self):
......@@ -177,13 +175,13 @@ class ConfigDict(DotMap):
else:
super().__setitem__(attr, val)
def set_dynamic_flags(self, boolean):
"""Recursively set "_dynamic" to True/False."""
def recursively_set_dynamic_flags(self, boolean):
"""Recursively set the "_dynamic" attribute to True/False."""
boolean = bool(boolean)
self._dynamic = boolean
for key, val in self.items():
if isinstance(val, type(self)):
val.set_dynamic_flags(boolean)
val.recursively_set_dynamic_flags(boolean)
self[key] = val