Commit d820791a authored by Paulo Medeiros
Browse files

New "thin" command

This is a merge.
parents aeded9d6 52a50521
Pipeline #9438 failed with stages
in 53 seconds
......@@ -12,6 +12,7 @@ from .commands_functions import (
csv2obsoul,
select_stations,
show,
thin_data_from_csv_files,
)
......@@ -196,6 +197,29 @@ def get_parsed_args(program_name):
parser_csv2obsoul.set_defaults(func=csv2obsoul)
###########################################
# Configure parser for the "thin" command #
###########################################
parser_thin = subparsers.add_parser(
"thin",
help=(
"Thin data from input file(s) using the thinning configs "
"specified for the adopted domain."
),
)
parser_thin.add_argument(
"paths",
nargs="*",
type=Path,
default=list(Path(".").glob("*.csv")),
help=(
"Path(s) to input CSVs containing at least (lon, lat) data. "
"Directory paths are also accepted, in which case they will be "
"recursively searched for CSV files."
),
)
parser_thin.set_defaults(func=thin_data_from_csv_files)
###########################################
# Configure parser for the "apps" command #
###########################################
......
......@@ -36,11 +36,14 @@ from .plots import (
make_clustering_fig,
show_cmd_get_fig_from_dataframes,
)
from .save_data import netatmoqc_input2output
from .save_data import netatmoqc_input2output, save_df_as_netatmo_csv
logger = logging.getLogger(__name__)
#########################################
# Code related to the "cluster" command #
#########################################
def cluster_obs_single_dtg(args):
"""Implement the "cluster" command.
......@@ -104,6 +107,9 @@ def cluster_obs_single_dtg(args):
)
########################################
# Code related to the "select" command #
########################################
def _select_stations_single_dtg(dtg, config, args):
"""Implement "select_stations" for a single DTG.
......@@ -333,21 +339,14 @@ def select_stations(args):
df_accepted["rejection_rate"].sort_values(ascending=True).index
]
# (c) Add grid (i, j) info
icol, jcol = domain.thinning_grid.lonlat2grid(
df_accepted["lon"].to_numpy(), df_accepted["lat"].to_numpy(),
# (c) Thin data keeping only the first entry found at each (i, j).
# As we've sorted by rejection rate (lower to higher), only the
# lowest-rejection-rate station at each grid (i, j) will be kept.
grid_trimmed_stations = domain.thinning_grid.thin_obs(
df_accepted, method="first"
)
df_accepted["i"] = icol
df_accepted["j"] = jcol
# (d) rm all but the lowest-rejection-rate station at each grid (i, j)
grid_trimmed_stations = df_accepted.groupby(
["i", "j"], as_index=False, sort=False
).first()
grid_trimmed_stations = grid_trimmed_stations.drop(["i", "j"], axis=1)
df_accepted = df_accepted.drop(["i", "j"], axis=1)
# (e) Finally, move to df_rejected those stations that appear in
# (d) Finally, move to df_rejected those stations that appear in
# df_accepted but not in grid_trimmed_stations
grid_trimmed_mask = df_accepted["id"].isin(grid_trimmed_stations["id"])
df_rejected = df_rejected.append(df_accepted[~grid_trimmed_mask])
......@@ -438,7 +437,9 @@ def select_stations(args):
)
# Code related to the csv2obsoul command
############################################
# Code related to the "csv2obsoul" command #
############################################
def csv2obsoul(args):
"""Implement the "csv2obsoul" command.
......@@ -493,7 +494,68 @@ def csv2obsoul(args):
)
# Code related to the "show" command
######################################
# Code related to the "thin" command #
######################################
def thin_data_from_csv_files(args):
    """Implement the 'thin' command.

    Every input CSV is read, trimmed to the configured domain, thinned on
    the domain's thinning grid with the "nearest" method, and written to a
    timestamped directory under the configured output directory.

    Args:
        args (argparse.Namespace): Parsed command line arguments. The
            attributes read here are ``loglevel``, ``config_file`` and
            ``paths``.

    """
    logger = get_logger(__name__, args.loglevel)
    config = read_config(args.config_file)
    domain = Domain.construct_from_dict(config.domain)

    # All results of this run go under one timestamped directory.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    outdir_prefix = config.general.outdir / "{}_netatmoqc_thin".format(
        timestamp
    )

    # Directories are expanded (recursively) into the csv files they
    # contain; any other path is taken as given and checked later.
    candidates = []
    for path in args.paths:
        if path.is_dir():
            candidates.extend(path.rglob("*.csv"))
        else:
            candidates.append(path)

    for fpath in candidates:
        if fpath.suffix != ".csv":
            logger.warning(
                "Only csv files supported. Skipping file '%s'", fpath
            )
            continue

        logger.info("Parsing data from file %s", fpath)

        logger.debug("Read data from file %s", fpath)
        try:
            raw_data = pd.read_csv(fpath)
        except FileNotFoundError:
            logger.warning("File %s not found.", fpath)
            continue
        except pd.errors.EmptyDataError:
            logger.warning('No data in file "%s"', fpath)
            continue

        logger.debug("Thin data from file %s", fpath)
        thinned = domain.thinning_grid.thin_obs(
            domain.trim_obs(raw_data), method="nearest"
        )

        # Mirror the input's directory layout below the output prefix.
        # NOTE(review): a relative input path containing ".." appears to
        # resolve outside outdir_prefix here -- confirm whether that
        # needs guarding.
        relative_parent = fpath.parent.relative_to(fpath.anchor)
        outdir = outdir_prefix / relative_parent
        outdir.mkdir(parents=True, exist_ok=True)
        out_fpath = outdir / fpath.name
        logger.info("Saving thinned data to file %s\n", out_fpath)
        save_df_as_netatmo_csv(thinned, out_fpath, overwrite=True)
######################################
# Code related to the "show" command #
######################################
def _open_file_with_default_app(fpath):
if platform.system() == "Windows":
os.startfile(fpath)
......
......@@ -7,6 +7,7 @@ import attr
import numpy as np
import pyproj
from .metrics import haversine_distance
from .plots import DEF_FIGSHOW_CONFIG, get_domain_fig
logger = logging.getLogger(__name__)
......@@ -295,6 +296,47 @@ class DomainGrid(Grid2D):
"""Return tuple of (lon, lat) coords of the extension zone corners."""
return tuple(self.proj.xy2lonlat(*xy) for xy in self.ezone_corners)
def thin_obs(self, df, method="nearest"):
    """Return the rows of df thinned to one entry per grid point.

    The input dataframe is left untouched: the helper grid-index columns
    are added to a copy, never to the caller's object.

    Args:
        df (pandas.DataFrame): Data to be thinned. The "lon" and "lat"
            columns are read.
        method (str): How the surviving row of each grid cell is chosen.
            "first" keeps the first row, in the current order of df,
            found at each (i, j). "nearest" keeps the row with the
            smallest haversine distance to its (i, j) grid point.

    Returns:
        pandas.DataFrame: The thinned data with the same columns as df
        and a fresh 0..n-1 index. An empty df is returned as is.

    Raises:
        NotImplementedError: If method is neither "nearest" nor "first".

    """
    if method not in ["nearest", "first"]:
        raise NotImplementedError(
            "'method' must be one of: 'nearest', 'first'"
        )

    if len(df.index) == 0:
        return df

    # Add grid (i, j) info. assign() returns a new dataframe, so the
    # caller's df does not end up with stray "i" and "j" columns.
    icol, jcol = self.lonlat2grid(
        df["lon"].to_numpy(), df["lat"].to_numpy()
    )
    df = df.assign(i=icol, j=jcol)

    if method == "nearest":
        # Get (lon, lat) for grid points
        g2lon, g2lat = self.ij2lonlat_map()

        def _dist(lon, lat, i, j):
            p0 = np.array([lon, lat])
            p1 = np.array([g2lon[i, j], g2lat[i, j]])
            return haversine_distance(p0, p1)

        # Iterate over the columns rather than over rows: a row-wise
        # apply upcasts every value in a row to a common dtype, which
        # can turn the integer (i, j) indices into floats.
        # NOTE(review): assumes haversine_distance returns a scalar for
        # a single pair of points -- confirm.
        distances = np.fromiter(
            (
                _dist(lon, lat, i, j)
                for lon, lat, i, j in zip(
                    df["lon"], df["lat"], df["i"], df["j"]
                )
            ),
            dtype=float,
            count=len(df.index),
        )
        # Sort data by distance to nearest grid point. Positional
        # reordering is safe even if index labels are duplicated, and a
        # stable sort keeps ties in their original order.
        df = df.iloc[np.argsort(distances, kind="stable")]

    # Remove all but one (the first) entry at each grid (i, j). Whole
    # rows are kept; groupby(...).first() would instead take the first
    # non-null value of every column and could mix values coming from
    # different rows of the same cell. Rows without a defined (i, j) are
    # discarded, as groupby did.
    df = (
        df.dropna(subset=["i", "j"])
        .drop_duplicates(subset=["i", "j"], keep="first")
        .reset_index(drop=True)
    )

    # Remove no longer needed (i, j) info
    return df.drop(["i", "j"], axis=1)
class Domain:
"""Model domain geometry and grid.
......
[build-system]
build-backend = "poetry.core.masonry.api"
requires = [
"poetry-core >= 1.0.0",
"numpy >= 1.19.0",
]
[tool.poetry]
name = "netatmoqc"
version = "0.3.5"
version = "0.3.6"
description = "Use machine learning clustering methods to perform quality control over NetAtmo data"
authors = [
"Paulo V. C. Medeiros <paulo.medeiros@smhi.se>"
......@@ -20,6 +13,13 @@
[tool.poetry.scripts]
netatmoqc = "netatmoqc.main:main"
[build-system]
build-backend = "poetry.core.masonry.api"
requires = [
"poetry-core >= 1.0.0",
"numpy >= 1.19.0",
]
[tool.poetry.dependencies]
attrs = "^20.2.0"
dash = "^1.13.4"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment