Commit 11e352b8 authored by Paulo Medeiros
Browse files

Add "haversine_plus_euclidean" metrics calc method

parent c802e68f
......@@ -394,7 +394,11 @@ with config_section("metrics") as section:
config_metadata.register(
"method",
default="correlation_aware_euclidean",
choices=["correlation_aware_euclidean", "haversine_plus_manhattan"],
choices=[
"correlation_aware_euclidean",
"haversine_plus_manhattan",
"haversine_plus_euclidean",
],
)
config_metadata.register(
"optimize_mode",
......
......@@ -91,10 +91,14 @@ def get_obs_norm_factors(obs_values):
return rtn
@njit("f4[:](f8[:, :], f8[:])", parallel=True, cache=True)
def calc_distance_matrix_haversine_plus_manhattan_numba(df, weights):
@njit("f4[:](f8[:, :], f8[:], types.unicode_type)", parallel=True, cache=True)
def numba_calc_distance_matrix_haversine_plus(df, weights, method):
"""Calculate distance matrix using python+numba.
Spatial distances are calculated using the haversine method.
The non-spatial part of the distance is calculated using the
method specified in "method".
Args:
df (numpy.ndarray): Multidimensional numpy array containing the data
entries, obtained from a pandas dataframe (numba doesn't work with
......@@ -102,11 +106,21 @@ def calc_distance_matrix_haversine_plus_manhattan_numba(df, weights):
weights (numpy.array): Weights chosen for each observation parameter.
The weights determine the relative importance of the observation
parameters w.r.t. each other.
method (basestring): The method to be used for the non-spatial part of
the distance.
Returns:
numpy.ndarray: Data to be used in the construction of the dist matrix.
Raises:
NotImplementedError: If method not in ["manhattan", "euclidean"].
"""
if method not in ["manhattan", "euclidean"]:
raise NotImplementedError(
"'method' must be one of: manhattan, euclidean"
)
nrows, ncols = df.shape
# Get normalisation factors so that observations in different
......@@ -139,25 +153,35 @@ def calc_distance_matrix_haversine_plus_manhattan_numba(df, weights):
i, j = np.zeros(2, dtype=np.int64)
for idist in prange(n_dists): # pylint: disable=not-an-iterable
i, j = _data_index_to_matrix_index(nrows, idist, check_bounds=False)
rtn[idist] = weights_internal[0] * haversine_distance(
df[i], df[j]
) + np.sum(np.abs(weights_internal[1:] * (df[j, 2:] - df[i, 2:])))
rtn[idist] = weights_internal[0] * haversine_distance(df[i], df[j])
if method == "manhattan":
rtn[idist] += np.sum(
np.abs(weights_internal[1:] * (df[j, 2:] - df[i, 2:]))
)
elif method == "euclidean":
rtn[idist] += np.sqrt(
np.sum((weights_internal[1:] * (df[j, 2:] - df[i, 2:])) ** 2)
)
return rtn
def calc_distance_matrix_haversine_plus_manhattan(
df, weights, optimize_mode, num_threads=-1
def calc_distance_matrix_haversine_plus(
df, weights, method, optimize_mode, num_threads=-1
):
"""Calculate distance matrix between obs in dataframe df.
Spatial distances are calculated using the haversine method.
The non-spatial part of the distance is calculated using the
method specified in "method".
Args:
df (pandas.Dataframe): Input data.
weights (numpy.array): Weights chosen for each observation parameter.
The weights determine the relative importance of the observation
parameters w.r.t. each other.
method (basestring): The method to be used for the non-spatial part of
the distance.
optimize_mode: How the distance matrix is to be calculated and stored.
This is passed onto the constructor for the class
netatmoqc.hollow_symmetric_matrix.HollowSymmetricMatrix.
......@@ -168,18 +192,29 @@ def calc_distance_matrix_haversine_plus_manhattan(
netatmoqc.hollow_symmetric_matrix.HollowSymmetricMatrix: The distance
matrix.
Raises:
NotImplementedError: If method not in ["manhattan", "euclidean"].
"""
logger.debug(" > Calculating distance matrix...")
tstart = time.time()
method = method.lower()
allowed_methods = ["manhattan", "euclidean"]
if method not in allowed_methods:
raise NotImplementedError(
"Argument 'method' must be one of: %s"
% (", ".join(allowed_methods))
)
if num_threads > 0:
original_nthreads = numba.get_num_threads()
numba.set_num_threads(num_threads)
atexit.register(numba.set_num_threads, original_nthreads)
rtn = HollowSymmetricMatrix(
data=calc_distance_matrix_haversine_plus_manhattan_numba(
df.to_numpy(), weights=weights
data=numba_calc_distance_matrix_haversine_plus(
df.to_numpy(), weights=weights, method=method
),
optimize_mode=optimize_mode,
)
......@@ -333,25 +368,18 @@ def calc_distance_matrix(df, config, domain=None, num_threads=-1):
accepted_methods = [
"correlation_aware_euclidean",
"haversine_plus_manhattan",
"haversine_plus_euclidean",
]
method = config.metrics.method.lower()
if method not in accepted_methods:
raise NotImplementedError(
"Distance matrix calc method '%s' not available. " % (method)
+ "Choose method from: %s" % (", ".join(accepted_methods))
+ "Please choose method from: %s" % (", ".join(accepted_methods))
)
logger.debug("Computing distance matrix using the '%s' method", method)
weights_dict = config.get_clustering_opt("obs_weights")
df = df.copy().drop(config.general.unclusterable_data_columns, axis=1)
if method == "haversine_plus_manhattan":
return calc_distance_matrix_haversine_plus_manhattan(
# Drop columns that won't be used in the clustering
df=df,
weights=weights_dict_to_np_array(df, config=config),
optimize_mode=config.metrics.optimize_mode,
num_threads=num_threads,
)
if method == "correlation_aware_euclidean":
if domain is None:
......@@ -367,3 +395,12 @@ def calc_distance_matrix(df, config, domain=None, num_threads=-1):
num_threads=num_threads,
domain=domain,
)
else:
return calc_distance_matrix_haversine_plus(
# Drop columns that won't be used in the clustering
df=df,
weights=weights_dict_to_np_array(df, config=config),
method=method.replace("haversine_plus_", ""),
optimize_mode=config.metrics.optimize_mode,
num_threads=num_threads,
)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment