Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
iObs
WP2
Task 2.3
netatmoqc
Commits
7f378cf4
Commit
7f378cf4
authored
Jul 09, 2021
by
Paulo Medeiros
Browse files
Fix a few crashes in outlier removal methods
Fixes issue
#5
parent
2193ff60
Pipeline
#9774
failed with stage
in 0 seconds
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
netatmoqc/clustering.py
View file @
7f378cf4
...
...
@@ -134,7 +134,7 @@ def run_clustering_on_df(
df
,
config
,
distance_matrix
=
None
,
outlier_rm_method
=
"config"
,
remove_outliers
=
True
,
calc_silhouette_samples
=
True
,
n_jobs
=-
1
,
):
...
...
@@ -145,9 +145,8 @@ def run_clustering_on_df(
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (HollowSymmetricMatrix): Obs distance matrix.
Default value = None.
outlier_rm_method (str): Outlier removal method.
Default value = "config". This means that the default will be
retrieved from the corresponding value in the config argument.
remove_outliers (bool): Use a post-clustering outlier removal method?
Default value = True.
calc_silhouette_samples (bool): Calculate or not silhouette_samples?
Default value = True.
n_jobs (int): Max number of local-host parallel jobs.
...
...
@@ -166,6 +165,7 @@ def run_clustering_on_df(
raise
NotImplementedError
(
'Method "{}" not available.'
.
format
(
method
))
if
len
(
df
.
index
)
==
0
:
logger
.
warning
(
"Dataframe has no rows"
)
df
[
"cluster_label"
]
=
None
return
df
# We will not do any df = StandardScaler().fit_transform(df),
...
...
@@ -239,17 +239,14 @@ def run_clustering_on_df(
# expects 'cluster_label' to be the last column in the dataframe
df
[
"cluster_label"
]
=
db
.
labels_
# Refine clustering if requested
# It is important to have 'cluster_label' as the last column
# when running the iterative refine routine
if
outlier_rm_method
==
"config"
:
outlier_rm_method
=
config
.
get_clustering_opt
(
"outlier_removal.method"
)
if
outlier_rm_method
:
if
remove_outliers
:
# Refine clustering if requested
# It is important to have 'cluster_label' as the last column
# when running the iterative refine routine
df
=
filter_outliers
(
df
,
config
=
config
,
db
=
db
,
outlier_rm_method
=
outlier_rm_method
,
distance_matrix
=
distance_matrix
,
n_jobs
=
n_jobs
,
reclustering_function
=
self_consistent_reclustering
,
...
...
@@ -322,7 +319,7 @@ def self_consistent_reclustering(df, config, distance_matrix, **kwargs):
df_rec
=
run_clustering_on_df
(
df
,
config
=
config
,
outlier_rm_method
=
None
,
remove_outliers
=
False
,
distance_matrix
=
distance_matrix
.
subspace
(
i_valid_obs
),
calc_silhouette_samples
=
"silhouette_score"
in
df
.
columns
,
**
kwargs
,
...
...
@@ -352,11 +349,7 @@ def _cluster_netatmo_obs_one_domain(df, config, **kwargs):
"""
time_start_clustering
=
time
.
time
()
logger
.
debug
(
"Performing clustering..."
)
df
=
run_clustering_on_df
(
df
=
df
,
config
=
config
,
**
kwargs
,
)
df
=
run_clustering_on_df
(
df
=
df
,
config
=
config
,
**
kwargs
)
time_end_clustering
=
time
.
time
()
logger
.
debug
(
"Done with clustering. Elapsed: %.2fs"
,
...
...
netatmoqc/outlier_removal.py
View file @
7f378cf4
...
...
@@ -210,6 +210,9 @@ def filter_outliers_iterative(df, config, trunc_perc=0.25):
column's mean. Repeat the process until either no data is removed or
the max number of iterations is reached.
It is important to have 'cluster_label' as the last column when running
this method.
Args:
df (pandas.Dataframe): Dataframe containing clustering info.
config (ParsedConfig): Parsed configs.
...
...
@@ -295,6 +298,8 @@ def get_local_outlier_factors(df, distance_matrix, calc_per_cluster=False):
all_lof_values
[
indices
]
=
clf
.
negative_outlier_factor_
else
:
indices
=
df
.
index
[
df
[
"cluster_label"
]
>
-
1
]
if
len
(
indices
)
==
0
:
return
all_lof_values
clf
=
LocalOutlierFactor
(
n_neighbors
=
3
,
metric
=
"precomputed"
)
clf
.
fit_predict
(
distance_matrix
.
subspace
(
indices
))
all_lof_values
[
indices
]
=
clf
.
negative_outlier_factor_
...
...
@@ -322,16 +327,16 @@ def filter_outliers_lof(df, distance_matrix):
# Higher-level outlier removal routine calling the specific ones defined above
def
filter_outliers
(
df
,
db
,
outlier_rm_method
,
distance_matrix
,
**
kwargs
):
def
filter_outliers
(
df
,
db
,
config
,
distance_matrix
,
**
kwargs
):
"""Filter outliers according to specified outlier detection method.
Args:
df (pandas.Dataframe): Input data with clustering info.
db (obj): Output of clustering method.
outlier_rm_method (str): Outlier detection method of choice.
config (netatmoqc.config.ParsedConfig): Program's general configs.
distance_matrix (netatmoqc.metrics.HollowSymmetricMatrix): Distance
matrix consistent with the input data.
**kwargs: Passed on to internal
m
wrapped routines.
**kwargs: Passed on to internal wrapped routines.
Returns:
pandas.Dataframe: Copy of input data, with clustering labels for
...
...
@@ -339,6 +344,8 @@ def filter_outliers(df, db, outlier_rm_method, distance_matrix, **kwargs):
"""
tstart
=
time
.
time
()
outlier_rm_method
=
config
.
get_clustering_opt
(
"outlier_removal.method"
)
logger
.
debug
(
' > Running outlier removal method "%s" with kwargs=%s'
,
outlier_rm_method
,
...
...
@@ -353,9 +360,11 @@ def filter_outliers(df, db, outlier_rm_method, distance_matrix, **kwargs):
rtn
=
filter_outliers_lof
(
df
,
distance_matrix
=
distance_matrix
)
elif
outlier_rm_method
==
"reclustering"
:
func
=
kwargs
.
pop
(
"reclustering_function"
)
rtn
=
func
(
df
,
distance_matrix
,
**
kwargs
)
rtn
=
func
(
df
,
config
=
config
,
distance_matrix
=
distance_matrix
,
**
kwargs
)
else
:
rtn
=
filter_outliers_iterative
(
df
,
**
kwargs
)
rtn
=
filter_outliers_iterative
(
df
,
config
=
config
)
logger
.
debug
(
" * Done with outlier removal. Elapsed: %.1fs"
,
time
.
time
()
-
tstart
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment