Commit d4dba6d7 authored by Paulo Medeiros
Browse files

Stable sandbox data converters/readers

- Combine obs in 1min windows and mean before dropna
- Change default thinning_grid_coarse_factor to zero
- Add qc_strategy config option
- Refactoring in the apps to get a proper wsgi
- "show config" command allows selecting sections
- rm data normalisation, let obs weights do this job
- some changes to config parameters and command line args
- Tweak clustering params to reduce rejection rates
- Some refactoring, optimisations and fixes

Merge branch 'refactor/load_data' into devel
parents 5639d129 55e291ef
Pipeline #20570 failed with stages in 6 seconds
......@@ -7,6 +7,8 @@ tmp_*
temp_*
tmp\.*
temp\.*
prof/
*.tmp
*.egg-info
build/
*.csv
......
......@@ -98,9 +98,6 @@
# - lat and lon: These do not have individual weights. Use "geo_dist"
# (without quotes) to set a combined weight for the haversine
# distances calculated based on the (lat, lon) pairs.
# - pressure: You should instead set the weight for mslp in this case.
# Although the NetAtmo input files have "pressure" written in the
# header, the reported data is actually mean sea-level pressure.
#
# The weights will multiply the normalised values of the observation diffs
# in the calculation of the pairwise distance matrix. If weight<=0, then
......@@ -111,8 +108,7 @@
y = 1.0
alt = 1.0
temperature = 5.0
pressure = 0.0
mslp = 1.0
pressure = 1.0
humidity = 1.0
sum_rain_1 = 1.0
......
#!/usr/bin/env python3
"""Common definitions."""
import numba
from importlib.metadata import version
try:
# From python3.8
from importlib.metadata import version
except ModuleNotFoundError:
# Prior to python3.8
from importlib_metadata import version
import numba
try:
__version__ = version(__name__)
except ModuleNotFoundError:
__version__ = "?"
PACKAGE_NAME = __name__
# Set the threading layer before any parallel target compilation.
# Picking "omp" specifically to avoid warnings about old TBB versions.
......
This diff is collapsed.
#!/usr/bin/env python3
"""Code for the clustering app's UI."""
import contextlib
import numpy as np
from dash import dash_table, dcc, html
from pandas._libs.tslibs import to_offset
from ...config_parser import get_clustering_opt
def description_card():
    """Return a Div containing dashboard title & descriptions.

    Returns:
        An ``html.Div`` (id "description-card") holding the app name, a
        subtitle and a one-line intro text, with centred text.
    """
    # Dash "style" dicts take camelCased CSS property names, so the key is
    # "textAlign" rather than "text-align".
    return html.Div(
        id="description-card",
        children=[
            html.H5("NetAtmoQC"),
            html.H6("Interactive Data Clustering"),
            html.Div(
                id="intro",
                style={"textAlign": "center"},
                children="QC of NetAtmo Station Data via Clustering",
            ),
        ],
        style={"textAlign": "center"},
    )
def generate_obs_weights_panel(configs_from_file):
    """Build the collapsible panel holding the observation-weight inputs.

    Args:
        configs_from_file: Parsed configs, queried through
            ``get_clustering_opt`` for ``obs_weights.<var_name>`` entries.

    Returns:
        An ``html.Div`` (id "obs_weights_div") wrapping an ``html.Details``
        element that contains two table-rows of three weight inputs each.
    """

    def obs_weight_cell(var_name, default=1.0, minval=0.0, maxval=np.inf):
        """Return the table-cell div with the weight input for `var_name`."""
        # A weight found in the config file takes precedence over the
        # default passed by the caller. A failed lookup (AttributeError or
        # KeyError) and a None value both mean "not configured".
        try:
            configured = get_clustering_opt(
                configs_from_file, f"obs_weights.{var_name}"
            )
        except (AttributeError, KeyError):
            configured = None
        initial_value = default if configured is None else configured

        label = html.P(var_name, id=f"{var_name}_weight_label")
        number_input = dcc.Input(
            id=f"{var_name}_weight",
            type="number",
            inputMode="numeric",
            min=minval,
            max=maxval,
            value=initial_value,
            style={"width": "90%"},
        )
        return html.Div(
            id=f"{var_name}_weight_div",
            children=[label, number_input],
            style={"display": "table-cell"},
        )

    def weights_row(*cells):
        """Wrap the given cells in a div displayed as a table row."""
        return html.Div(children=list(cells), style={"display": "table-row"})

    panel_title = "Weights for the calculated differences"
    first_row = weights_row(
        obs_weight_cell("geo_dist"),
        obs_weight_cell("alt", default=0.001),
        obs_weight_cell("temperature", default=10.0),
    )
    second_row = weights_row(
        obs_weight_cell("pressure", default=10.0),
        obs_weight_cell("humidity", default=10.0),
        obs_weight_cell("sum_rain_1"),
    )
    collapsible = html.Details(
        title=panel_title,
        children=[html.Summary(panel_title), first_row, html.Br(), second_row],
        style={"display": "table"},
    )
    return html.Div(id="obs_weights_div", children=[collapsible])
def generate_control_card(configs_from_file):
    """Return a Div containing controls for graphs.

    Args:
        configs_from_file: Parsed configs. Read here for
            ``general.assimilation_times`` (initial date and cycle length)
            and, via ``get_clustering_opt``, for the initial values of
            ``min_samples``, ``min_cluster_size`` and
            ``cluster_selection_epsilon``. Also forwarded to
            ``generate_obs_weights_panel``.

    Returns:
        The ``html.Div`` with id "control-card".
    """
    allowed_cluster_methods = ["hdbscan", "dbscan", "rsl", "optics"]
    allowed_outlier_rm_method = ["Iterative", "GLOSH", "LOF", "Reclustering"]

    initial_date = min(configs_from_file.general.assimilation_times).mid
    cycle_length = to_offset(configs_from_file.general.assimilation_times.cycle_length)
    # Use total_seconds() instead of the ".seconds" attribute: ".seconds" is
    # only the seconds component of the timedelta (days are excluded), so a
    # cycle length of a whole number of days would yield step=0.
    datetime_step = int(cycle_length.delta.total_seconds())

    return html.Div(
        id="control-card",
        children=[
            html.P("QC Strategy"),
            dcc.Dropdown(
                id="qc_strategy",
                options=[
                    {
                        "label": "First space, then observations",
                        "value": "first_space_then_observations",
                    },
                    {"label": "Combined Space & Obs", "value": "combined_space_and_obs"},
                ],
                value="first_space_then_observations",
            ),
            html.Br(),
            #
            html.P("Clustering Method"),
            dcc.Dropdown(
                id="method-select",
                options=[{"label": i, "value": i} for i in allowed_cluster_methods],
                value=allowed_cluster_methods[0],
            ),
            html.Br(),
            #
            html.Div(
                id="post_space_clustering_outlier_method_div",
                children=[
                    html.P("Post-Clustering Outlier Removal Method"),
                    dcc.Dropdown(
                        id="post_space_clustering_outlier_method",
                        options=[
                            {"label": i, "value": i}
                            for i in ["lof", "zscore", "quantiles"]
                        ],
                        value="lof",
                    ),
                ],
            ),
            html.Br(),
            #
            # Put some inputs in their own Divs to be able to put them side-by-side
            html.Div(
                [
                    html.P("min_samples"),
                    dcc.Input(
                        id="min_samples",
                        type="number",
                        inputMode="numeric",
                        min=1,
                        value=get_clustering_opt(configs_from_file, "min_samples"),
                        step=1,
                        required=True,
                        style={"width": "90%"},
                    ),
                ],
                style={"display": "table-cell"},
            ),
            # The min_cluster_size input should not show when using dbscan.
            # The way to create conditional inputs in dash is to put them in
            # separate Divs and change the Divs' "display" attr to "none" in
            # a callback.
            # Adapted from: <https://stackoverflow.com/questions/50213761/
            #                changing-visibility-of-a-dash-component-by-
            #                updating-other-component>
            html.Div(
                id="min_cluster_size_div",
                children=[
                    html.P("min_cluster_size", id="min_cluster_size_label"),
                    dcc.Input(
                        id="min_cluster_size",
                        type="number",
                        inputMode="numeric",
                        min=1,
                        value=get_clustering_opt(configs_from_file, "min_cluster_size"),
                        step=1,
                        style={"width": "90%"},
                    ),
                ],
                style={"display": "table-cell"},
            ),
            # The eps input should not show when using hdbscan. Doing similarly
            # to min_cluster_size above.
            html.Div(
                id="eps_div",
                children=[
                    html.P("eps", id="eps_label"),
                    dcc.Input(
                        id="eps",
                        type="number",
                        inputMode="numeric",
                        min=0.0,
                        value=get_clustering_opt(
                            configs_from_file, "cluster_selection_epsilon"
                        ),
                        style={"width": "90%"},
                    ),
                ],
                style={"display": "table-cell"},
            ),
            #
            html.Div(
                id="metrics_method_div",
                children=[
                    html.Br(),
                    html.P("Metrics Calculation Method"),
                    dcc.Dropdown(
                        id="metrics_method",
                        options=[
                            {
                                "label": "Correlation-Aware Euclidean",
                                "value": "correlation_aware_euclidean",
                            },
                            {
                                "label": "Haversine + Euclidean",
                                "value": "haversine_plus_euclidean",
                            },
                            {
                                "label": "Haversine + Manhattan",
                                "value": "haversine_plus_manhattan",
                            },
                        ],
                        value="correlation_aware_euclidean",
                    ),
                ],
                # Dash "style" dicts take camelCased CSS property names.
                style={"display": "block", "textAlign": "center"},
            ),
            #
            html.Div(
                id="optionals_div",
                children=[
                    html.Div(
                        id="outlier_rm_method_div",
                        children=[
                            html.Br(),
                            html.P("Post-Clustering Outlier Removal (Optional)"),
                            dcc.Dropdown(
                                id="outlier_rm_method",
                                # Labels keep the original casing; the option
                                # values are the lower-cased method names.
                                options=[{"label": "None", "value": None}]
                                + [
                                    {"label": i, "value": i.lower()}
                                    for i in allowed_outlier_rm_method
                                ],
                                value="glosh",
                            ),
                        ],
                        style={"display": "block", "textAlign": "center"},
                    ),
                    html.Div(
                        # The 'style' property of this div will be
                        # set via callback
                        id="max_num_refining_iter_div",
                        children=[
                            html.Div(
                                children=[
                                    html.Br(),
                                    html.P("Max #refining iterations"),
                                    dcc.Input(
                                        id="max_num_refine_iter",
                                        type="number",
                                        inputMode="numeric",
                                        min=1,
                                        value=100,
                                        step=1,
                                    ),
                                ],
                                style={"display": "table-cell", "textAlign": "center"},
                            ),
                            html.Div(
                                children=[
                                    html.Br(),
                                    html.P("Max #stdev around mean"),
                                    dcc.Input(
                                        id="max_n_std_around_mean",
                                        type="number",
                                        inputMode="numeric",
                                        min=1,
                                        value=2,
                                    ),
                                ],
                                style={"display": "table-cell", "textAlign": "center"},
                            ),
                        ],
                    ),
                ],
            ),
            html.Br(),
            #
            generate_obs_weights_panel(configs_from_file),
            html.Br(),
            html.Div(
                id="date-div",
                style=dict(display="table-cell"),
                children=[
                    html.P("Time Window Length"),
                    dcc.Input(
                        id="cycle-length-picker",
                        type="text",
                        value=configs_from_file.general.assimilation_times.cycle_length,
                        required=True,
                    ),
                    html.P("Date & Time"),
                    dcc.Input(
                        id="datetime-picker-select",
                        type="datetime-local",
                        value=initial_date.strftime("%Y-%m-%dT%H:%M:%S"),
                        # Selectable date-times advance in whole cycles.
                        step=datetime_step,
                        required=True,
                    ),
                ],
            ),
            html.Br(),
            html.Div(
                id="plot-btn-outer",
                children=[
                    html.Button(
                        id="plot-btn",
                        children="Run Clustering and Plot",
                        n_clicks=0,
                    ),
                ],
            ),
        ],
    )
def indicator(text, id_value):
    """Build the html.Div for a single app indicator.

    Args:
        text: Caption shown in the "indicator_text" paragraph.
        id_value: Component id given to the "indicator_value" paragraph.
            The outer div gets the id ``"<id_value>_div"``.

    Returns:
        The outer ``html.Div``, with centred text.
    """
    # Layout adapted from the Dash gallery's app "dash-salesforce-crm".
    # The value paragraph is created without content.
    # NOTE(review): presumably it is filled in by a callback -- confirm.
    value_paragraph = html.P(id=id_value, className="indicator_value")
    caption_paragraph = html.P(text, className="indicator_text")
    inner_div = html.Div(children=[value_paragraph, caption_paragraph])
    return html.Div(
        id=f"{id_value}_div",
        children=[inner_div],
        className="indicator pretty_container",
        style={"textAlign": "center"},
    )
def generate_indicators():
    """Build the flex container holding all of the app's indicators.

    Returns:
        An ``html.Div`` (id "indicators_div") with one ``indicator`` child
        per (caption, component id) pair listed below, in that order.
    """
    captions_and_ids = (
        ("Mean Silhouette Score", "silhouette_coeff_indicator"),
        ("# Clusters", "nclusters_indicator"),
        ("# Accepted Obs", "naccepted_indicator"),
        ("# Clustering Noise", "nrejected_indicator"),
        ("# Clustering Outliers", "nremoved_indicator"),
        ("# Total Rejected", "nremoved_total_indicator"),
    )
    cards = [indicator(caption, card_id) for caption, card_id in captions_and_ids]
    return html.Div(
        id="indicators_div",
        children=cards,
        style={"display": "flex"},
    )
def generate_right_column_elements():
    """Build the components placed in the right-hand side column.

    Returns:
        A two-element list: the div holding the clustering plot, followed
        by the card holding the (initially empty) clustered-data table.
    """
    plot_div = html.Div(
        id="clustering_plot_div",
        children=[dcc.Graph(id="clustering_plot")],
    )

    # The table starts with no rows and no columns.
    data_table = dash_table.DataTable(
        id="clustered_data_table",
        data=[],
        columns=[],
        export_columns="all",
        sort_action="native",
        filter_action="native",
        # Styling
        style_cell={"padding": "10px"},
        style_header={
            "backgroundColor": "rgb(2,21,70)",
            "color": "white",
            "textAlign": "center",
        },
        # Control table scrolling
        # Don't use fixed_rows right now. It causes formatting
        # issues at the moment (tested with dash v1.11 and v1.12)
        style_table={
            "maxHeight": "300px",
            "overflowY": "scroll",
            "overflowX": "auto",
        },
    )
    table_card = html.Div(
        id="clustered_data_table_card",
        children=[
            html.B("Data after processing by clustering algorithm"),
            html.Hr(),
            data_table,
        ],
    )

    return [plot_div, table_card]
#!/usr/bin/env python3
"""Code for the scattergeo_timeseries app."""
import logging
import os
import datetime
from datetime import datetime as dt
from pathlib import Path
import dash
import numpy as np
......@@ -10,31 +10,12 @@ import pandas as pd
import plotly.graph_objects as go
from dash import dcc, html
from dash.dependencies import Input, Output, State
from server import server
from pandas._libs.tslibs import to_offset
from netatmoqc.config_parser import ParsedConfig
from netatmoqc.domains import Domain
from netatmoqc.load_data import (
read_netatmo_data_for_time_window,
remove_irregular_stations,
)
from netatmoqc.logs import CustomFormatter
from netatmoqc.plots import generate_single_frame, init_fig_dict
logger = logging.getLogger(__name__)
logger_handler = logging.StreamHandler()
logger_handler.setFormatter(CustomFormatter())
logging.basicConfig(level=logging.INFO, handlers=[logger_handler])
config = ParsedConfig.from_file(os.getenv("NETATMOQC_CONFIG_PATH", "config.toml"))
domain = Domain.from_config(config.domain)
app = dash.Dash(
name="scattergeo_timeseries",
server=server,
url_base_pathname="/scattergeo_timeseries/",
meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}],
)
from ...datetime_utils import as_datetime
from ...domains import Domain
from ...load_data import dash_apps_read_data, remove_irregular_stations
from ...plots import generate_single_frame, init_fig_dict
def description_card():
......@@ -56,34 +37,44 @@ def description_card():
variable_list = [
"temperature",
"pressure",
"mslp",
"alt",
"humidity",
"sum_rain_1",
]
def generate_control_card():
def generate_control_card(configs_from_file):
"""Return: A Div containing controls for graphs."""
initial_date = min(configs_from_file.general.assimilation_times).mid
initial_date = dt.combine(initial_date.date(), initial_date.time())
cycle_length = to_offset(configs_from_file.general.assimilation_times.cycle_length)
final_date = dt.combine(initial_date.date(), dt.min.time())
final_date += datetime.timedelta(days=1) - cycle_length
return html.Div(
id="control-card",
children=[
html.P("Select variable"),
html.P("Variable"),
dcc.Dropdown(
id="variable-select",
options=[{"label": i, "value": i} for i in variable_list],
value=variable_list[0],
),
html.Br(),
html.P("Select time range"),
dcc.DatePickerRange(
id="date-picker-select",
minimum_nights=0,
start_date=dt(2018, 4, 1),
end_date=dt(2018, 4, 1),
min_date_allowed=dt(2018, 4, 1),
max_date_allowed=dt.today(),
initial_visible_month=dt(2018, 4, 1),
html.P("Time range"),
dcc.Input(
id="start-datetime-picker",
type="datetime-local",
value=initial_date.isoformat(),
step=cycle_length.delta.seconds,
required=True,
),
dcc.Input(
id="end-datetime-picker",
type="datetime-local",
value=final_date.isoformat(),
step=cycle_length.delta.seconds,
required=True,
),
html.Br(),
html.Br(),
......@@ -95,119 +86,125 @@ def generate_control_card():
)
app.layout = html.Div(
id="app-container",
children=[
# Left column
html.Div(
id="left-column",
className="three columns",
children=[description_card(), generate_control_card()]
+ [
html.Div(
["initial child"],
id="output-clientside",
style={"display": "none"},
)
],
),
# Right column
html.Div(
id="right-column",
className="nine columns",
children=[
html.Div(
id="scattergeo_plot_div",
children=[
html.B("My Plot"),
html.Hr(),
dcc.Graph(id="netatmo_data_plot"),
],
),