Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
iObs
WP2
Task 2.3
netatmoqc
Commits
11e352b8
Commit
11e352b8
authored
Nov 04, 2021
by
Paulo Medeiros
Browse files
Add "haversine_plus_euclidean" metrics calc method
parent
c802e68f
Changes
2
Hide whitespace changes
Inline
Side-by-side
netatmoqc/config_parser.py
View file @
11e352b8
...
...
@@ -394,7 +394,11 @@ with config_section("metrics") as section:
config_metadata
.
register
(
"method"
,
default
=
"correlation_aware_euclidean"
,
choices
=
[
"correlation_aware_euclidean"
,
"haversine_plus_manhattan"
],
choices
=
[
"correlation_aware_euclidean"
,
"haversine_plus_manhattan"
,
"haversine_plus_euclidean"
,
],
)
config_metadata
.
register
(
"optimize_mode"
,
...
...
netatmoqc/metrics.py
View file @
11e352b8
...
...
@@ -91,10 +91,14 @@ def get_obs_norm_factors(obs_values):
return
rtn
@
njit
(
"f4[:](f8[:, :], f8[:])"
,
parallel
=
True
,
cache
=
True
)
def
calc_distance_matrix_haversine_plus
_manhattan_numba
(
df
,
weights
):
@
njit
(
"f4[:](f8[:, :], f8[:]
, types.unicode_type
)"
,
parallel
=
True
,
cache
=
True
)
def
numba_
calc_distance_matrix_haversine_plus
(
df
,
weights
,
method
):
"""Calculate distance matrix using python+numba.
Spatial distances are calculated using the haversine method.
The non-spatial part of the distance is calculated using the
method specified in "method".
Args:
df (numpy.ndarray): Multidimensional numpy array containing the data
entries, obtained from a pandas dataframe (numba doesn't work with
...
...
@@ -102,11 +106,21 @@ def calc_distance_matrix_haversine_plus_manhattan_numba(df, weights):
weights (numpy.array): Weights chosen for each observation parameter.
The weights determine the relative importance of the observation
parameters w.r.t. each other.
method (basestring): The method to be used for the non-spatial part of
the distance.
Returns:
numpy.ndarray: Data to be used in the construction of the dist matrix.
Raises:
NotImplementedError: If method not in ["manhattan", "euclidean"].
"""
if
method
not
in
[
"manhattan"
,
"euclidean"
]:
raise
NotImplementedError
(
"'method' must be one of: manhattan, euclidean"
)
nrows
,
ncols
=
df
.
shape
# Get normalisation factors so that observations in different
...
...
@@ -139,25 +153,35 @@ def calc_distance_matrix_haversine_plus_manhattan_numba(df, weights):
i
,
j
=
np
.
zeros
(
2
,
dtype
=
np
.
int64
)
for
idist
in
prange
(
n_dists
):
# pylint: disable=not-an-iterable
i
,
j
=
_data_index_to_matrix_index
(
nrows
,
idist
,
check_bounds
=
False
)
rtn
[
idist
]
=
weights_internal
[
0
]
*
haversine_distance
(
df
[
i
],
df
[
j
]
)
+
np
.
sum
(
np
.
abs
(
weights_internal
[
1
:]
*
(
df
[
j
,
2
:]
-
df
[
i
,
2
:])))
rtn
[
idist
]
=
weights_internal
[
0
]
*
haversine_distance
(
df
[
i
],
df
[
j
])
if
method
==
"manhattan"
:
rtn
[
idist
]
+=
np
.
sum
(
np
.
abs
(
weights_internal
[
1
:]
*
(
df
[
j
,
2
:]
-
df
[
i
,
2
:]))
)
elif
method
==
"euclidean"
:
rtn
[
idist
]
+=
np
.
sqrt
(
np
.
sum
((
weights_internal
[
1
:]
*
(
df
[
j
,
2
:]
-
df
[
i
,
2
:]))
**
2
)
)
return
rtn
def
calc_distance_matrix_haversine_plus
_manhattan
(
df
,
weights
,
optimize_mode
,
num_threads
=-
1
def
calc_distance_matrix_haversine_plus
(
df
,
weights
,
method
,
optimize_mode
,
num_threads
=-
1
):
"""Calculate distance matrix between obs in dataframe df.
Spatial distances are calculated using the haversine method.
The non-spatial part of the distance is calculated using the
method specified in "method".
Args:
df (pandas.DataFrame): Input data.
weights (numpy.array): Weights chosen for each observation parameter.
The weights determine the relative importance of the observation
parameters w.r.t. each other.
method (basestring): The method to be used for the non-spatial part of
the distance.
optimize_mode: How the distance matrix is to be calculated and stored.
This is passed onto the constructor for the class
netatmoqc.hollow_symmetric_matrix.HollowSymmetricMatrix.
...
...
@@ -168,18 +192,29 @@ def calc_distance_matrix_haversine_plus_manhattan(
netatmoqc.hollow_symmetric_matrix.HollowSymmetricMatrix: The distance
matrix.
Raises:
NotImplementedError: If method not in ["manhattan", "euclidean"].
"""
logger
.
debug
(
" > Calculating distance matrix..."
)
tstart
=
time
.
time
()
method
=
method
.
lower
()
allowed_methods
=
[
"manhattan"
,
"euclidean"
]
if
method
not
in
allowed_methods
:
raise
NotImplementedError
(
"Argument 'method' must be one of: %s"
%
(
", "
.
join
(
allowed_methods
))
)
if
num_threads
>
0
:
original_nthreads
=
numba
.
get_num_threads
()
numba
.
set_num_threads
(
num_threads
)
atexit
.
register
(
numba
.
set_num_threads
,
original_nthreads
)
rtn
=
HollowSymmetricMatrix
(
data
=
calc_distance_matrix_haversine_plus
_manhattan_numba
(
df
.
to_numpy
(),
weights
=
weights
data
=
numba_
calc_distance_matrix_haversine_plus
(
df
.
to_numpy
(),
weights
=
weights
,
method
=
method
),
optimize_mode
=
optimize_mode
,
)
...
...
@@ -333,25 +368,18 @@ def calc_distance_matrix(df, config, domain=None, num_threads=-1):
accepted_methods
=
[
"correlation_aware_euclidean"
,
"haversine_plus_manhattan"
,
"haversine_plus_euclidean"
,
]
method
=
config
.
metrics
.
method
.
lower
()
if
method
not
in
accepted_methods
:
raise
NotImplementedError
(
"Distance matrix calc method '%s' not available. "
%
(
method
)
+
"
C
hoose method from: %s"
%
(
", "
.
join
(
accepted_methods
))
+
"
Please c
hoose method from: %s"
%
(
", "
.
join
(
accepted_methods
))
)
logger
.
debug
(
"Computing distance matrix using the '%s' method"
,
method
)
weights_dict
=
config
.
get_clustering_opt
(
"obs_weights"
)
df
=
df
.
copy
().
drop
(
config
.
general
.
unclusterable_data_columns
,
axis
=
1
)
if
method
==
"haversine_plus_manhattan"
:
return
calc_distance_matrix_haversine_plus_manhattan
(
# Drop columns that won't be used in the clustering
df
=
df
,
weights
=
weights_dict_to_np_array
(
df
,
config
=
config
),
optimize_mode
=
config
.
metrics
.
optimize_mode
,
num_threads
=
num_threads
,
)
if
method
==
"correlation_aware_euclidean"
:
if
domain
is
None
:
...
...
@@ -367,3 +395,12 @@ def calc_distance_matrix(df, config, domain=None, num_threads=-1):
num_threads
=
num_threads
,
domain
=
domain
,
)
else
:
return
calc_distance_matrix_haversine_plus
(
# Drop columns that won't be used in the clustering
df
=
df
,
weights
=
weights_dict_to_np_array
(
df
,
config
=
config
),
method
=
method
.
replace
(
"haversine_plus_"
,
""
),
optimize_mode
=
config
.
metrics
.
optimize_mode
,
num_threads
=
num_threads
,
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment