# Source code for iccas.loading

import datetime
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import urljoin

import pandas as pd
from iccas._caching import RemoteFolderProxy
from iccas.types import DateLike, PathType

# Root URL of the remote data folder; dataset paths are resolved against it.
BASE_URL = "https://raw.githubusercontent.com/janLuke/iccas-dataset/master/data/"

# File formats accepted by `_check_file_format`.
AVAILABLE_FORMATS = {"csv"}

# Default local folder used as `cache_dir` by the `get*` functions.
DEFAULT_CACHE_DIR = Path.home() / ".iccas"

# Paths (relative to BASE_URL) of the population reference tables.
POPULATION_BY_AGE_PATH = "util/italian_population_by_age_2020.csv"
POPULATION_BY_AGE_GROUP_PATH = "util/italian_population_by_age_group_2020.csv"


def _check_file_format(fmt: str):
    """Validate a dataset file format.

    Raises:
        ValueError: if ``fmt`` is not one of ``AVAILABLE_FORMATS``.
    """
    if fmt in AVAILABLE_FORMATS:
        return
    raise ValueError(f"unavailable format: {fmt}")


def _date_as_str(date: DateLike) -> str:
    if isinstance(date, str):
        return date
    if isinstance(date, datetime.date):
        return date.strftime("%Y-%m-%d")
    raise TypeError


def _get_dataset_path(date: Optional[DateLike] = None, fmt: str = "csv") -> str:
    if date:
        date_str = _date_as_str(date)
        return "by-date/iccas_{}.{}".format(date_str, fmt)
    return "iccas_full.{}".format(fmt)


def get_url(date: Optional[DateLike] = None, fmt: str = "csv") -> str:
    """Returns the url of a dataset in a given format.

    Args:
        date: date of the dataset. If `date` is None (or otherwise falsy),
            the URL of the full dataset is returned.
        fmt: file format; must be one of ``AVAILABLE_FORMATS``.

    Raises:
        ValueError: if ``fmt`` is not an available format.
    """
    _check_file_format(fmt)
    return urljoin(BASE_URL, _get_dataset_path(date=date, fmt=fmt))
def load(path: PathType) -> pd.DataFrame:
    """
    Loads the dataset stored as CSV in `path`.

    The file is read with `(date, age_group)` as index (the `date` column is
    parsed as datetime) and then the `age_group` level is unstacked, so that
    the returned DataFrame is indexed by `date` and has two-level columns
    whose second level is `age_group`.

    Args:
        path: location of the CSV file, passed as is to :func:`pandas.read_csv`
    """
    dataset = pd.read_csv(
        path, index_col=["date", "age_group"], parse_dates=["date"]
    )
    return dataset.unstack(level="age_group")
def load_single_date(path: PathType, keep_date: bool = False) -> pd.DataFrame:
    """
    Loads a dataset containing data for a single date.

    By default (`keep_date=False`), the `date` column is dropped and the
    datetime is stored in the `attrs` of the DataFrame (under the `"date"`
    key). If instead `keep_date=True`, the returned dataset has a MultiIndex
    `(date, age_group)`.

    Args:
        path: location of the CSV file, passed as is to :func:`pandas.read_csv`
        keep_date: whether to keep the date (a single datetime value) as an
            index level instead of dropping the column
    """
    if keep_date:
        return pd.read_csv(
            path, index_col=["date", "age_group"], parse_dates=["date"]
        )

    dataset = pd.read_csv(path, index_col="age_group", parse_dates=["date"])
    # All rows share the same date, so the first value is representative.
    date = dataset["date"].iloc[0]
    dataset = dataset.drop(columns="date")
    dataset.attrs["date"] = date
    return dataset
def get(cache_dir: PathType = DEFAULT_CACHE_DIR) -> pd.DataFrame:
    """
    Returns the latest version of the ICCAS dataset in a
    :class:`pandas.DataFrame` (as it's returned by :func:`load`).

    This function uses :meth:`RemoteFolderProxy.get`, which caches.

    Args:
        cache_dir: local folder given to :class:`RemoteFolderProxy` as
            `local_path`

    Raises:
        :exc:`requests.exceptions.ConnectionError`: if the server is
            unreachable and no dataset is available in `cache_dir`
    """
    data_proxy = RemoteFolderProxy(folder_url=BASE_URL, local_path=cache_dir)
    path = data_proxy.get(_get_dataset_path())
    return load(path)
def get_by_date(
    date: DateLike,
    keep_date: bool = False,
    cache_dir: PathType = DEFAULT_CACHE_DIR,
) -> pd.DataFrame:
    """
    Returns the dataset of a single date in a :class:`pandas.DataFrame`
    (as it's returned by :func:`load_single_date`).

    The file is obtained through :meth:`RemoteFolderProxy.get`.

    Args:
        date: date of the dataset to retrieve
        keep_date: forwarded to :func:`load_single_date`
        cache_dir: local folder given to :class:`RemoteFolderProxy` as
            `local_path`
    """
    data_proxy = RemoteFolderProxy(folder_url=BASE_URL, local_path=cache_dir)
    path = data_proxy.get(_get_dataset_path(date=date))
    return load_single_date(path, keep_date=keep_date)
def get_population_by_age(cache_dir: PathType = DEFAULT_CACHE_DIR) -> pd.DataFrame:
    """
    Returns a DataFrame with "age" as index and two columns: "value"
    (absolute counts) and "percentage" (fraction of the total, <=1.0)

    Args:
        cache_dir: local folder given to :class:`RemoteFolderProxy` as
            `local_path`
    """
    data_proxy = RemoteFolderProxy(folder_url=BASE_URL, local_path=cache_dir)
    path = data_proxy.get(POPULATION_BY_AGE_PATH)
    pop = pd.read_csv(path, index_col=0)
    # Bracket access: unlike `pop.value`, it can never be shadowed by a
    # DataFrame attribute.
    pop["percentage"] = pop["value"] / pop["value"].sum()
    return pop
def get_population_by_age_group(
    cache_dir: PathType = DEFAULT_CACHE_DIR,
) -> pd.DataFrame:
    """
    Returns a DataFrame with "age_group" as index and two columns: "value"
    (absolute counts) and "percentage" (fraction of the total, <=1.0)

    Args:
        cache_dir: local folder given to :class:`RemoteFolderProxy` as
            `local_path`
    """
    data_proxy = RemoteFolderProxy(folder_url=BASE_URL, local_path=cache_dir)
    path = data_proxy.get(POPULATION_BY_AGE_GROUP_PATH)
    pop = pd.read_csv(path, index_col=0)
    # Bracket access: unlike `pop.value`, it can never be shadowed by a
    # DataFrame attribute.
    pop["percentage"] = pop["value"] / pop["value"].sum()
    return pop