# Source code for iccas.processing

import re
from typing import Union

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_integer_dtype

from iccas.queries import cols, only_counts
from iccas.types import PandasObj


def reindex_by_interpolating(
    data: PandasObj,
    new_index: pd.DatetimeIndex,
    preserve_ints: bool = True,
    method="pchip",
    **interpolation,
) -> PandasObj:
    """
    Reindexes `data` and fills new values by interpolation (PCHIP, by default).

    This function was motivated by the fact that :meth:`pandas.DataFrame.resample`
    followed by :meth:`pandas.DataFrame.interpolate` doesn't take into account
    misaligned datetimes.

    The interpolation is computed on the union of the original index and
    `new_index`, so the original observations are used as interpolation
    points even when they are not part of `new_index`. Only the rows labelled
    by `new_index` are returned.

    Args:
        data:
            a DataFrame or Series with a datetime index
        new_index:
            the index of the returned object
        preserve_ints:
            after interpolation, columns containing integers in the original
            dataframe are rounded and converted back to int
        method:
            interpolation method (see :meth:`pandas.DataFrame.interpolate`)
        **interpolation:
            other interpolation keyword argument different from `method` passed
            to :meth:`pandas.DataFrame.interpolate`

    Returns:
        a new Dataframe/Series indexed by `new_index`; the index is named
        ``"date"``

    Raises:
        ValueError: from the integer conversion, when `preserve_ints` is true
            and an integer column still contains NaN after interpolation
            (pandas raises its ``IntCastingNaNError`` subclass).

    See Also:
        :func:`resample`
    """
    is_integer_dtype = pd.api.types.is_integer_dtype

    extended_index = data.index.union(new_index)
    out = data.reindex(extended_index)
    out = out.interpolate(method=method, **interpolation)  # fill NaNs
    out = out.loc[new_index]

    if preserve_ints:
        if isinstance(data, pd.DataFrame):
            int_cols = [
                col for col, dtype in data.dtypes.items() if is_integer_dtype(dtype)
            ]
            if int_cols:
                # Replace the columns as a whole: `out.loc[:, int_cols] = ...`
                # writes into the existing float columns on pandas >= 2.0 and
                # would leave their dtype as float.
                out[int_cols] = out[int_cols].round().astype(int)
        elif is_integer_dtype(data):
            out = out.round().astype(int)

    # Assign a renamed index rather than renaming in place, so an index object
    # that may be shared with the caller is never mutated.
    out.index = out.index.rename("date")
    return out


def resample(
    data: PandasObj,
    freq: Union[int, str] = "1D",
    hour: int = 18,
    preserve_ints: bool = True,
    method="pchip",
    **interpolation,
) -> PandasObj:
    """
    Resamples `data` on a regular datetime grid, filling values by interpolation.

    An integer `freq` is read as a number of days; a string `freq` is
    upper-cased before being interpreted.

    * "Day frequencies" (``'D'`` or ``'{num}D'``): the grid is built going
      backwards from the latest timestamp (`data.index[-1]`) towards the
      earliest one (`data.index[0]`), both with the hour replaced by `hour`
      and the minutes set to 0. The grid therefore always contains the latest
      date and is returned in ascending order.
    * Any other frequency: the grid is a forward range that starts at
      `data.index[0]` (unchanged) and stops at `data.index[-1]` with the hour
      replaced by `hour` and the minutes set to 0.

    This function was motivated by the fact that :meth:`pandas.DataFrame.resample`
    followed by :meth:`pandas.DataFrame.interpolate` doesn't take into account
    misaligned datetimes. If you want to back-fill or forward-fill, just use
    :meth:`DataFrame.resample`.

    Args:
        data:
            a DataFrame or Series with a datetime index
        freq:
            resampling frequency in `pandas` notation, or a number of days
        hour:
            reference hour used for the grid endpoints as described above
        preserve_ints:
            after interpolation, columns containing integers in the original
            dataframe are rounded and converted back to int
        method:
            interpolation method (see :meth:`pandas.DataFrame.interpolate`)
        **interpolation:
            other interpolation keyword argument different from `method` passed
            to :meth:`pandas.DataFrame.interpolate`

    Returns:
        a new Dataframe/Series with index elements spaced according to ``freq``

    See Also:
        :func:`reindex_by_interpolating`
    """
    freq = f"{freq}D" if isinstance(freq, int) else freq.upper()

    # Both kinds of grid end at the latest timestamp moved to `hour`:00.
    latest = data.index[-1].replace(hour=hour, minute=0)

    if re.fullmatch(r"(\d*)D", freq) is None:
        # Generic frequency: plain forward range from the first timestamp.
        new_index = pd.date_range(data.index[0], latest, freq=freq)
    else:
        # Day frequency: walk backwards from the latest date so that it is
        # always part of the grid, then flip back to ascending order.
        earliest = data.index[0].replace(hour=hour, minute=0)
        backward_step = "-1D" if freq == "D" else f"-{freq}"
        new_index = pd.date_range(latest, earliest, freq=backward_step)[::-1]

    return reindex_by_interpolating(
        data,
        new_index,
        preserve_ints=preserve_ints,
        method=method,
        **interpolation,
    )


# ===========================
#   DATA CORRECTION
# ===========================
def nullify_series_local_bumps(series: pd.Series) -> pd.Series:
    """
    Set to NaN all elements s[i] such that s[i] > s[i+k] for some k > 0.

    An element is a "bump" when it is strictly greater than some element that
    comes after it (in positional order), i.e. when it is greater than the
    minimum of the values from its position to the end of the series.

    Values that are already NaN are ignored when computing that minimum and
    are left as NaN in the result.

    Args:
        series: a numeric Series; elements are compared in positional order

    Returns:
        a new Series with the same index where every bump is replaced by NaN
    """
    # Minimum of the values from each position to the end of the series,
    # computed as a cumulative minimum over the reversed series (NaNs skipped).
    suffix_min = series.iloc[::-1].cummin().iloc[::-1]

    # Compare positionally on the raw arrays so that the result does not
    # depend on the index labels (which may contain duplicates). Comparisons
    # involving NaN are False, so NaN entries are never flagged.
    is_bump = series.to_numpy() > suffix_min.to_numpy()
    return series.mask(is_bump)


def nullify_local_bumps(df: pd.DataFrame):
    """
    Apply :func:`nullify_series_local_bumps` to the columns of `df`.

    Columns whose name contains ``"unknown"`` (as a substring of a string
    name, or as an element of a tuple name) are returned untouched.

    Args:
        df: the DataFrame whose columns are processed one by one

    Returns:
        the result of ``df.apply`` over the per-column function
    """

    def _clean_column(column):
        skip = "unknown" in column.name
        return column if skip else nullify_series_local_bumps(column)

    return df.apply(_clean_column)


def fix_monotonicity(data: pd.DataFrame, method="pchip", **interpolation):
    """
    Replaces tracts of "cases" and "deaths" time series that break the monotonicity
    of the series with interpolated data, ensuring that the sum of male and female
    counts are less or equal to the total count.

    The correction is done in two steps:

    1. every count series is fixed independently: values that are greater
       than a later value are set to NaN (see :func:`nullify_local_bumps`),
       re-filled by interpolation, rounded and converted to int;
    2. for both "deaths" and "cases", each total is replaced by the sum of the
       corresponding male and female counts whenever that sum exceeds it.

    Args:
        data: a DataFrame containing all integer columns about cases and deaths
        method: interpolation method (see :meth:`pandas.DataFrame.interpolate`)
        **interpolation: other keyword arguments passed to
            :meth:`pandas.DataFrame.interpolate`

    Returns:
        a new integer DataFrame holding the corrected columns selected by
        ``only_counts(data)``
    """
    # Fix all individual series independently
    counts = only_counts(data)
    fixed = (
        nullify_local_bumps(counts)
        .interpolate(method=method, **interpolation)
        .round()
        .astype(int)
    )

    # Ensure that (males + females <= total) for each age group (including
    # "unknown age" group) taking the maximum between the fixed totals and the
    # fixed males+females
    for variable in ["deaths", "cases"]:
        # NOTE(review): presumably the male and female columns of `variable`,
        # with the age group on column level 1 -- confirm against `cols`.
        by_sex = fixed[cols("mf", variable)]
        # `DataFrame.sum(axis=1, level=1)` was removed in pandas 2.0. Grouping
        # the transposed frame by level 1 (keeping the original group order)
        # is the equivalent that works on both old and new pandas.
        males_plus_females = by_sex.T.groupby(level=1, sort=False).sum().T
        totals = fixed[variable]
        totals_fixed = (
            totals
            .where(totals >= males_plus_females)
            .fillna(males_plus_females)
        )
        fixed[variable] = totals_fixed

    # `where`/`fillna` may have produced floats: restore the integer dtype
    return fixed.astype(int)