Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
iObs
WP2
Task 2.3
netatmoqc
Commits
3a6c7250
Commit
3a6c7250
authored
Jul 09, 2021
by
Paulo Medeiros
Browse files
Add unclusterable_data_columns config option
This replaces the hardcoded "skip" lists, which is clearer and more useful.
parent
336e43ea
Changes
4
Hide whitespace changes
Inline
Side-by-side
netatmoqc/clustering.py
View file @
3a6c7250
...
...
@@ -134,7 +134,6 @@ def run_clustering_on_df(
df
,
config
,
distance_matrix
=
None
,
skip
=
(
"id"
,
"time_utc"
),
outlier_rm_method
=
"config"
,
calc_silhouette_samples
=
True
,
n_jobs
=-
1
,
...
...
@@ -146,8 +145,6 @@ def run_clustering_on_df(
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
Default value = None.
skip (tuple): Columns to skip in df.
Default value = ("id", "time_utc").
outlier_rm_method (str): Outlier removal method.
Default value = "config". This means that the default will be
retrieved from the corresponding value in the config argument.
...
...
@@ -180,7 +177,7 @@ def run_clustering_on_df(
# jit-compilation via numba, then the relative speed up can reach
# up to 120x.
distance_matrix
=
calc_distance_matrix
(
df
=
df
,
config
=
config
,
skip
=
skip
,
num_threads
=
n_jobs
df
=
df
,
config
=
config
,
num_threads
=
n_jobs
)
# Running clustering with the computed distance matrix
...
...
@@ -250,12 +247,12 @@ def run_clustering_on_df(
if
outlier_rm_method
:
df
=
filter_outliers
(
df
,
config
=
config
,
db
=
db
,
outlier_rm_method
=
outlier_rm_method
,
# Args that apply only to LOF
distance_matrix
=
distance_matrix
,
# Args that only apply for the iterative method
skip
=
skip
,
max_num_refine_iter
=
config
.
get_clustering_opt
(
"outlier_removal.{}.max_n_iter"
.
format
(
outlier_rm_method
)
),
...
...
netatmoqc/config_parser.py
View file @
3a6c7250
...
...
@@ -390,6 +390,10 @@ with config_section("general") as section:
default
=
"haversine"
,
choices
=
[
"correlation"
,
"haversine"
],
)
# Data cols to ignore when running clustering
config_metadata
.
register
(
"unclusterable_data_columns"
,
default
=
[
"id"
,
"time_utc"
]
)
# Data cols to export when saving obsoul output
config_metadata
.
register
(
"obsoul_export_params"
,
...
...
netatmoqc/metrics.py
View file @
3a6c7250
...
...
@@ -33,9 +33,7 @@ def haversine_distance(point1, point2):
return
rtn
def
weights_dict_to_np_array
(
df
,
pairwise_diff_weights
=
None
,
skip
=
(
"id"
,
"time_utc"
),
default
=
1
):
def
weights_dict_to_np_array
(
df
,
config
,
default
=
1
):
"""Convert pairwise_diff_weights into a numpy array.
Takes a pandas dataframe and a {column_name:weight} dictionary and returns
...
...
@@ -48,11 +46,8 @@ def weights_dict_to_np_array(
Args:
df (pandas.DataFrame): Dataframe with observations.
pairwise_diff_weights (dict): {df_column_name:weight} dictionary.
Default value = None.
skip: df columns that will not enter the clustering and should
therefore be skipped. Default value = ("id", "time_utc")
default: Default weight to be assigned for a non-skipped df column if
config (ParsedConfig): Parsed configs.
default: Default weight to be assigned to a non-skipped df column if
the column name is present in df but not in pairwise_diff_weights.
Default value = 1.
...
...
@@ -69,10 +64,12 @@ def weights_dict_to_np_array(
raise
ValueError
(
"'lat' column is not followed by 'lon' column"
)
weights
=
[]
weights_dict
=
config
.
get_clustering_opt
(
"obs_weights"
)
unclusterable_cols
=
config
.
general
.
unclusterable_data_columns
col2weight
=
{
c
:
(
"geo_dist"
if
c
==
"lon"
else
c
)
for
c
in
df
.
columns
}
for
col
in
df
.
columns
[
~
df
.
columns
.
isin
(
list
(
skip
)
+
[
"lat"
])]:
for
col
in
df
.
columns
[
~
df
.
columns
.
isin
(
unclusterable_cols
+
[
"lat"
])]:
try
:
weights
.
append
(
pairwise_diff_
weights
[
col2weight
[
col
]])
weights
.
append
(
weights
_dict
[
col2weight
[
col
]])
except
(
KeyError
,
TypeError
):
weights
.
append
(
default
)
return
np
.
array
(
weights
,
dtype
=
np
.
float64
)
...
...
@@ -295,9 +292,7 @@ def calc_distance_matrix_considering_correlation(
df
.
insert
(
0
,
"y"
,
yvals
/
1000.0
)
df
.
insert
(
0
,
"x"
,
xvals
/
1000.0
)
cols_to_skip
=
[
"id"
,
"time_utc"
]
cols_to_skip
+=
[
"lon"
,
"lat"
]
selected_data_columns
=
[
c
for
c
in
df
.
columns
if
c
not
in
cols_to_skip
]
selected_data_columns
=
[
c
for
c
in
df
.
columns
if
c
not
in
[
"lon"
,
"lat"
]]
df
=
df
[
selected_data_columns
]
weights_array
=
np
.
array
([
weights_dict
.
get
(
c
,
1.0
)
for
c
in
df
.
columns
])
...
...
@@ -316,12 +311,7 @@ def calc_distance_matrix_considering_correlation(
return
rtn
def
calc_distance_matrix
(
df
,
config
,
skip
=
None
,
num_threads
=-
1
,
):
def
calc_distance_matrix
(
df
,
config
,
num_threads
=-
1
):
"""Calculate distance matrix between obs in dataframe df.
Spatial distances are calculated by projecting (lon, lat) into (x, y)
...
...
@@ -332,11 +322,8 @@ def calc_distance_matrix(
Args:
df (pandas.DataFrame): Input data.
config (ParsedConfig): Parsed configs.
skip (list): List of dataframe columns to skip/ignore.
num_threads: Max number of threads used for the computation.
(Default value = -1)
method: The method used in the calculation of the distances.
(Default value = "haversine")
Raises:
NotImplementedError: If the passed method is not supported.
...
...
@@ -356,10 +343,11 @@ def calc_distance_matrix(
logger
.
debug
(
"Computing distance matrix using the '%s' method"
,
method
)
weights_dict
=
config
.
get_clustering_opt
(
"obs_weights"
)
df
=
df
.
copy
().
drop
(
config
.
general
.
unclusterable_data_columns
,
axis
=
1
)
if
method
==
"haversine"
:
return
calc_distance_matrix_haversine
(
# Drop columns that won't be used in the clustering
df
=
df
.
drop
(
list
(
skip
),
axis
=
1
)
,
df
=
df
,
weights
=
weights_dict_to_np_array
(
df
,
weights_dict
),
optimize_mode
=
config
.
general
.
custom_metrics_optimize_mode
,
num_threads
=
num_threads
,
...
...
netatmoqc/outlier_removal.py
View file @
3a6c7250
...
...
@@ -203,7 +203,7 @@ def _filter_outliers_iterative(
def
filter_outliers_iterative
(
df
,
skip
,
config
,
weights_dict
,
trunc_perc
=
0.25
,
max_num_refine_iter
=
1000
,
...
...
@@ -219,8 +219,7 @@ def filter_outliers_iterative(
Args:
df (pandas.DataFrame): Dataframe containing clustering info.
skip (list): List of names of columns in the df to be skipped (i.e.,
not taken into account in the filtering).
config (ParsedConfig): Parsed configs.
weights_dict (dict): Weights chosen for each observation parameter.
trunc_perc (float): Proportion of array elements to be removed for the
calculation of the truncated stds and means. Should lie between
...
...
@@ -237,7 +236,7 @@ def filter_outliers_iterative(
"""
df
[
"cluster_label"
]
=
_filter_outliers_iterative
(
df
.
drop
(
list
(
skip
)
,
axis
=
1
),
df
.
drop
(
config
.
general
.
unclusterable_data_columns
,
axis
=
1
),
max_num_iter
=
max_num_refine_iter
,
max_n_stdev_around_mean
=
max_n_stdev_around_mean
,
trunc_perc
=
trunc_perc
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment