Commit ab79ab32 authored by Paulo Medeiros's avatar Paulo Medeiros
Browse files

Lint load_data.py docstrings.

parent 5c88691c
......@@ -92,7 +92,15 @@ class DataFrameWithDtgMetadata(pd.DataFrame):
@np.vectorize
def mslp2sp_coeff(alt):
    """Coeff to convert mean sea-level pressure to pressure at altitude alt.

    Based on the barometric formula for the standard atmosphere
    (sea-level temperature 288 K, lapse rate 0.0065 K/m, exponent 5.255).

    Args:
        alt: Altitude in meters.

    Returns:
        float: Which multiplies by the mslp to give the surface pressure at alt

    """
    # int(round(...)) / 100000 truncates the conversion factor to five
    # decimal places so the coefficient is reproducible across platforms.
    return 1.0 / (
        int(round(100000 / ((288 - 0.0065 * alt) / 288) ** 5.255)) / 100000
    )
......@@ -129,7 +137,34 @@ def read_netatmo_csv(
recover_pressure_from_mslp=False,
drop_mslp=None,
):
"""Read CSV files with NetAtmo observation data."""
"""Read CSV files with NetAtmo observation data.
Args:
path (pathlib.Path): Full path to the csv file.
dropna (bool): Whether or not to exclude NaNs (Default value = True).
fillna (dict): Dictionary having data parameters as keys and the fill
values (which will replace NaNs) as dict values. The filling of
NaNs is performed prior to dropping NaNs. (Default value = None).
recover_pressure_from_mslp (bool): The netatmo csv files are assumed to
contain mslp data instead of surface pressure. This flag controls
whether surface pressure should also be included in the read data.
        If True, then a new column "pressure" will be added with surface
pressure calculated from altitude and mslp.
(Default value = False)
    drop_mslp (bool): Whether or not to drop the original mslp after
having created a new "pressure" column with surface pressure.
If None is used, then drop_mslp takes the value of the
recover_pressure_from_mslp parameter.
(Default value = None)
Returns:
pandas.Dataframe: Data read from the input netatmo csv file.
Raises:
ValueError: If drop_mslp evaluates to True while
recover_pressure_from_mslp evaluates to False.
"""
if drop_mslp is None:
drop_mslp = recover_pressure_from_mslp
if drop_mslp and not recover_pressure_from_mslp:
......@@ -180,11 +215,18 @@ def read_netatmo_csv(
def gen_netatmo_fpaths_for_dates(dates, rootdir):
"""Generate netatmo file paths for dates.
Return a generator for the paths of the NetAtmo CSV files that need
to be read in order to get data for date=dt
NetAtmo CSV files are assumed to have full paths following the pattern
"rootdir/%Y/%m/%d/%Y%m%dT%H%M%SZ.csv"
"rootdir/%Y/%m/%d/%Y%m%dT%H%M%SZ.csv"
Args:
dates (list): List of datetime.datetime objects. Only year, month, and
day are used.
rootdir (pathlib.Path): Path to the top-level directory where the
netatmo input csv files are stored.
Yields:
    pathlib.Path: Paths to files to be read in order to get data for date=dt
"""
try:
date_iter = iter(dates)
......@@ -202,9 +244,20 @@ def gen_netatmo_fpaths_for_dates(dates, rootdir):
def read_netatmo_data_for_dates(dates, rootdir, **kwargs):
"""Read netatmo data for passed dates.
Return a pd dataframe with data from NetAtmo files for given dates
Args:
dates (list): List of datetime.datetime objects. Only year, month, and
day are used.
rootdir (pathlib.Path): Path to the top-level directory where the
netatmo input csv files are stored.
**kwargs: Passed to read_netatmo_csv
Returns:
pandas.Dataframe: Data read from csv files for the dates passed
as input.
Raises:
DataNotFoundError: If data for the selected dates cannot be found.
kwargs are passed to read_netatmo_csv
"""
data_from_all_files = {}
for fpath in gen_netatmo_fpaths_for_dates(dates, rootdir):
......@@ -221,16 +274,27 @@ def read_netatmo_data_for_dates(dates, rootdir, **kwargs):
def _datetime_of_netatmo_file(fpath, fmt="%Y%m%dT%H%M%SZ.csv"):
    """Extract the timestamp encoded in a NetAtmo csv file name.

    Args:
        fpath (pathlib.Path): Path to the NetAtmo csv file.
        fmt (str): strptime-style pattern followed by the file name.
            (Default value = "%Y%m%dT%H%M%SZ.csv")

    Returns:
        datetime.datetime: Timezone-aware ("utc") datetime parsed from
            the file name.

    """
    # File names carry no offset information, so tag the parsed time as UTC.
    return datetime.strptime(fpath.name, fmt).replace(tzinfo=timezone("utc"))
# Reading data for a given DTG instead of date
def gen_netatmo_files_for_time_window(start, end, rootdir):
"""Generate NetAtmo files paths for datetimes within [start, end].
def gen_netatmo_fpaths_for_time_window(start, end, rootdir):
"""Generate NetAtmo csv file paths for datetimes within [start, end].
Returns a generator of paths leading to NetAtmo files whose names
correspond to timestamps lying within the closed interval [start, end].
Args:
start (datetime.datetime): Start date.
end (datetime.datetime): End date.
rootdir (pathlib.Path): Path to the top-level directory
where the netatmo input csv files are stored.
Yields:
pathlib.Path: NetAtmo csv file paths.
"""
start, end = sorted((start, end))
dates = pd.date_range(start.date(), end.date(), freq="1d")
......@@ -241,16 +305,38 @@ def gen_netatmo_files_for_time_window(start, end, rootdir):
def gen_netatmo_fpaths_for_dtg(dtg, rootdir):
    """Paths to NetAtmo files that are likely to contain data for DTG=dtg.

    Args:
        dtg (netatmoqc.dtgs.Dtg): Data assimilation DTG.
        rootdir (pathlib.Path): Path to the top-level directory
            where the netatmo input csv files are stored.

    Returns:
        generator: Generator of the NetAtmo csv file paths

    """
    # Add a few minutes to the assimilation window, as
    # netatmo data is gathered every ~10 minutes
    start = dtg.cycle_start - pd.Timedelta("15 minutes")
    end = dtg.cycle_end + pd.Timedelta("15 minutes")
    return gen_netatmo_fpaths_for_time_window(start, end, rootdir=rootdir)
def rm_moving_stations(df):
"""Return df excluding stations that change any of (lon, lat, alt)."""
"""Remove stations that change (lon, lat, alt) from the data.
Args:
df (pandas.Dataframe): Input data.
Returns:
{
'pandas.Dataframe': Copy of input df, but with
moving stations removed,
'pandas.Dataframe': The moving stations.
}
"""
df_grouped_by_id = df.groupby(["id"], as_index=False, sort=False)
df_temp = df_grouped_by_id[["lat", "lon", "alt"]].agg(["max", "min"])
inconsistent_stations = df_temp[
......@@ -266,7 +352,16 @@ def rm_moving_stations(df):
def rm_overlapping_stations(df):
"""Return df excluding stations with different IDs but same (lon, lat)."""
"""Remove from df the stations that have the same (lon, lat).
Args:
df (pandas.Dataframe): Input data.
Returns:
'pandas.Dataframe': Copy of input df, but with the
overlapping stations removed.
"""
overlapping_stations = (
df[["id", "lat", "lon"]]
.round(6)
......@@ -284,6 +379,15 @@ def remove_duplicates_within_cycle(df, dtg):
If a station reports data multiple times within the time window,
then keep the one closest to the passed DTG and discard the rest.
Args:
df (pandas.Dataframe): Input data.
dtg (netatmoqc.dtgs.Dtg): The picked assimilation DTG.
Returns:
'pandas.Dataframe': Copy of input df, where only one station
report is kept for each DTG.
"""
# It seems that using groupby followed by "apply" with a custom
# function is about 20x slower.
......@@ -312,9 +416,18 @@ def remove_duplicates_within_cycle(df, dtg):
def read_netatmo_data_for_dtg(dtg, rootdir, **kwargs):
"""Read NetAtmo data corresponding to the selected DTG.
Return a dataframe with data from NetAtmo files for given DTG
Args:
dtg (netatmoqc.dtgs.Dtg): The picked assimilation DTG.
rootdir (pathlib.Path): Path to the top-level directory where the
netatmo input csv files are stored.
**kwargs: Passed to read_netatmo_csv
Returns:
pandas.Dataframe: Data read from csv files for the input DTG.
Raises:
DataNotFoundError: If data for the selected DTGs cannot be found.
kwargs are passed to read_netatmo_csv
"""
t_start = time.time()
logger.debug("Reading data for DTG=%s...", dtg)
......@@ -366,7 +479,27 @@ def read_netatmo_data_for_dtg(dtg, rootdir, **kwargs):
def remove_irregular_stations(
df, moving=True, duplicates=True, overlapping=True
):
"""Remove obs from irregular stations (moving, duplicates or overlaps)."""
"""Remove obs from irregular stations (moving, duplicates or overlaps).
This is a wrapper for the following routines:
* rm_moving_stations
* remove_duplicates_within_cycle
* rm_overlapping_stations
Args:
df (pandas.Dataframe): Input data.
moving (bool): Whether or not to remove moving stations.
(Default value = True)
    duplicates (bool): Whether or not to remove duplicate station reports.
        (Default value = True)
overlapping (bool): Whether or not to remove overlapping stations.
        (Default value = True)
Returns:
'pandas.Dataframe': Copy of input df, but with the
        irregular stations removed.
"""
dtg = df.metadata_dict["dtg"]
if moving:
# Remove stations that change (lat, lon, alt) within cycle
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment