Commit b205df5b authored by Paulo Medeiros
Browse files

More refactoring in metrics.py

parent cb5ca9c5
......@@ -37,10 +37,11 @@ def weights_dict_to_np_array(df, config, default=1):
"""Convert pairwise_diff_weights into a numpy array.
Takes a pandas dataframe and a {column_name:weight} dictionary and returns
an array of weights as needed in the calc_distance_matrix_haversine_plus
routine.
an array of weights ordered in a way consistent with the order of the data
used in the routines to calculate distance matrices.
Columns "lat" and "lon" in df are treated specially, in that they are
If using "haversine_plus_" distance matrix calculation methods, then the
columns "lat" and "lon" in df are treated specially, in that they are
not assigned a weight individually, but rather a single weight gets
assigned to the "geo_dist" property.
......@@ -60,22 +61,29 @@ def weights_dict_to_np_array(df, config, default=1):
column.
"""
if df.columns.get_loc("lon") - df.columns.get_loc("lat") != 1:
raise ValueError("'lat' column is not followed by 'lon' column")
weights = []
weights_dict = config.get_clustering_opt("obs_weights")
unclusterable_cols = config.general.unclusterable_data_columns
col2weight = {c: ("geo_dist" if c == "lon" else c) for c in df.columns}
for col in df.columns[~df.columns.isin(unclusterable_cols + ["lat"])]:
try:
weights.append(weights_dict[col2weight[col]])
except (KeyError, TypeError):
weights.append(default)
weights = np.array(weights, dtype=np.float64)
weights = np.where(weights < 0, 0.0, weights)
if config.metrics.method.lower().startswith("haversine_plus_"):
if df.columns.get_loc("lon") - df.columns.get_loc("lat") != 1:
raise ValueError("'lat' column is not followed by 'lon' column")
weights = []
col2weight = {c: ("geo_dist" if c == "lon" else c) for c in df.columns}
for col in df.columns[~df.columns.isin(unclusterable_cols + ["lat"])]:
try:
weights.append(weights_dict[col2weight[col]])
except (KeyError, TypeError):
weights.append(default)
weights = np.array(weights, dtype=np.float64)
else:
# x and y should have their individual weights
unclusterable_cols = [
c for c in unclusterable_cols if c not in ["x", "y"]
]
selected_cols = [c for c in df.columns if c not in unclusterable_cols]
weights = np.array([weights_dict.get(c, 1.0) for c in selected_cols])
weights = np.where(weights < 0, 0.0, weights)
return weights
......@@ -185,7 +193,7 @@ def calc_distance_matrix_haversine_plus(df, config):
return HollowSymmetricMatrix(
data=numba_calc_distance_matrix_haversine_plus(
df.to_numpy(),
df=df.to_numpy(),
weights_array=weights_dict_to_np_array(df, config=config),
method=method,
),
......@@ -291,17 +299,14 @@ def calc_distance_matrix_considering_correlation(df, config, domain):
df.insert(0, "x", xvals / 1000.0)
df = df.drop(["lon", "lat"], axis=1)
# Make sure weights and df columns are consistent
weights_dict = config.get_clustering_opt("obs_weights")
weights_array = np.array([weights_dict.get(c, 1.0) for c in df.columns])
weights_array = np.where(weights_array < 0, 0.0, weights_array)
covariance_matrix = df.corr().fillna(0)
np.fill_diagonal(covariance_matrix.values, 1.0)
distance_matrix_data = (
calc_distance_matrix_data_considering_correlation_numba(
df.to_numpy(), weights_array, covariance_matrix.to_numpy()
df=df.to_numpy(),
weights_array=weights_dict_to_np_array(df, config=config),
covariance_matrix=covariance_matrix.to_numpy(),
)
)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment