utils.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| default_exp utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example Data\n",
    "\n",
    "> The `core.NeuralForecast` class allows you to efficiently fit multiple `NeuralForecast` models for large sets of time series. It operates with pandas DataFrame `df` that identifies individual series and datestamps with the `unique_id` and `ds` columns, and the `y` column denotes the target time series variable. To assist development, we declare useful datasets that we use throughout all `NeuralForecast`'s unit tests.<br><br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "import random\n",
    "from itertools import chain\n",
    "from typing import List\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| hide\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from nbdev.showdoc import add_docs, show_doc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Synthetic Panel Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "def generate_series(n_series: int,\n",
    "                    freq: str = 'D',\n",
    "                    min_length: int = 50,\n",
    "                    max_length: int = 500,\n",
    "                    n_temporal_features: int = 0,\n",
    "                    n_static_features: int = 0,\n",
    "                    equal_ends: bool = False,\n",
    "                    seed: int = 0) -> pd.DataFrame:\n",
    "    \"\"\"Generate Synthetic Panel Series.\n",
    "\n",
    "    Generates `n_series` of frequency `freq` of different lengths in the interval [`min_length`, `max_length`].\n",
    "    If `n_temporal_features > 0`, then each serie gets temporal features with random values.\n",
    "    If `n_static_features > 0`, then a static dataframe is returned along the temporal dataframe.\n",
    "    If `equal_ends == True` then all series end at the same date.\n",
    "\n",
    "    **Parameters:**<br>\n",
    "    `n_series`: int, number of series for synthetic panel.<br>\n",
    "    `min_length`: int, minimal length of synthetic panel's series.<br>\n",
    "    `max_length`: int, minimal length of synthetic panel's series.<br>\n",
    "    `n_temporal_features`: int, default=0, number of temporal exogenous variables for synthetic panel's series.<br>\n",
    "    `n_static_features`: int, default=0, number of static exogenous variables for synthetic panel's series.<br>\n",
    "    `equal_ends`: bool, if True, series finish in the same date stamp `ds`.<br>\n",
    "    `freq`: str, frequency of the data, [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).<br>\n",
    "\n",
    "    **Returns:**<br>\n",
    "    `freq`: pandas.DataFrame, synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.\n",
    "    \"\"\"\n",
    "    seasonalities = {'D': 7, 'M': 12}\n",
    "    season = seasonalities[freq]\n",
    "\n",
    "    rng = np.random.RandomState(seed)\n",
    "    series_lengths = rng.randint(min_length, max_length + 1, n_series)\n",
    "    total_length = series_lengths.sum()\n",
    "\n",
    "    dates = pd.date_range('2000-01-01', periods=max_length, freq=freq).values\n",
    "    uids = [\n",
    "        np.repeat(i, serie_length) for i, serie_length in enumerate(series_lengths)\n",
    "    ]\n",
    "    if equal_ends:\n",
    "        ds = [dates[-serie_length:] for serie_length in series_lengths]\n",
    "    else:\n",
    "        ds = [dates[:serie_length] for serie_length in series_lengths]\n",
    "\n",
    "    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5\n",
    "    temporal_df = pd.DataFrame(dict(unique_id=chain.from_iterable(uids),\n",
    "                                    ds=chain.from_iterable(ds),\n",
    "                                    y=y))\n",
    "\n",
    "    random.seed(seed)\n",
    "    for i in range(n_temporal_features):\n",
    "        random.seed(seed)\n",
    "        temporal_values = [\n",
    "            [random.randint(0, 100)] * serie_length for serie_length in series_lengths\n",
    "        ]\n",
    "        temporal_df[f'temporal_{i}'] = np.hstack(temporal_values)\n",
    "        temporal_df[f'temporal_{i}'] = temporal_df[f'temporal_{i}'].astype('category')\n",
    "        if i == 0:\n",
    "            temporal_df['y'] = temporal_df['y'] * \\\n",
    "                                  (1 + temporal_df[f'temporal_{i}'].cat.codes)\n",
    "\n",
    "    temporal_df['unique_id'] = temporal_df['unique_id'].astype('category')\n",
    "    temporal_df['unique_id'] = temporal_df['unique_id'].cat.as_ordered()\n",
    "\n",
    "    if n_static_features > 0:\n",
    "        static_features = np.random.uniform(low=0.0, high=1.0, \n",
    "                        size=(n_series, n_static_features))\n",
    "        static_df = pd.DataFrame.from_records(static_features, \n",
    "                           columns = [f'static_{i}'for i in  range(n_static_features)])\n",
    "        \n",
    "        static_df['unique_id'] = np.arange(n_series)\n",
    "        static_df['unique_id'] = static_df['unique_id'].astype('category')\n",
    "        static_df['unique_id'] = static_df['unique_id'].cat.as_ordered()\n",
    "\n",
    "        return temporal_df, static_df\n",
    "\n",
    "    return temporal_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show_doc(generate_series, title_level=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "synthetic_panel = generate_series(n_series=2)\n",
    "synthetic_panel.groupby('unique_id').head(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "temporal_df, static_df = generate_series(n_series=1000, n_static_features=2,\n",
    "                                         n_temporal_features=4, equal_ends=False)\n",
    "static_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. AirPassengers Data\n",
    "\n",
    "The classic Box & Jenkins airline data. Monthly totals of international airline passengers, 1949 to 1960.\n",
    "\n",
    "It has been used as a reference on several forecasting libraries, since it is a series that shows clear trends and seasonalities it offers a nice opportunity to quickly showcase a model's predictions performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "AirPassengers = np.array([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,\n",
    "                          118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,\n",
    "                          114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,\n",
    "                          162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,\n",
    "                          209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,\n",
    "                          272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,\n",
    "                          302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,\n",
    "                          315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,\n",
    "                          318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,\n",
    "                          348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,\n",
    "                          362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,\n",
    "                          342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,\n",
    "                          417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,\n",
    "                          432.], dtype=np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "AirPassengersDF = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),\n",
    "                                'ds': pd.date_range(start='1949-01-01',\n",
    "                                                    periods=len(AirPassengers), freq=pd.offsets.MonthEnd()),\n",
    "                                'y': AirPassengers})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AirPassengersDF.head(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#We are going to plot the ARIMA predictions, and the prediction intervals.\n",
    "fig, ax = plt.subplots(1, 1, figsize = (20, 7))\n",
    "plot_df = AirPassengersDF.set_index('ds')\n",
    "\n",
    "plot_df[['y']].plot(ax=ax, linewidth=2)\n",
    "ax.set_title('AirPassengers Forecast', fontsize=22)\n",
    "ax.set_ylabel('Monthly Passengers', fontsize=20)\n",
    "ax.set_xlabel('Timestamp [t]', fontsize=20)\n",
    "ax.legend(prop={'size': 15})\n",
    "ax.grid()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_static_features = 3\n",
    "n_series = 5\n",
    "\n",
    "static_features = np.random.uniform(low=0.0, high=1.0, \n",
    "                        size=(n_series, n_static_features))\n",
    "static_df = pd.DataFrame.from_records(static_features, \n",
    "                   columns = [f'static_{i}'for i in  range(n_static_features)])\n",
    "static_df['unique_id'] = np.arange(n_series)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "static_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Panel AirPassengers Data\n",
    "\n",
    "Extension to classic Box & Jenkins airline data. Monthly totals of international airline passengers, 1949 to 1960.\n",
    "\n",
    "It includes two series with static, temporal and future exogenous variables, that can help to explore the performance of models like `NBEATSx` and `TFT`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "\n",
    "# Declare Panel Data\n",
    "unique_id = np.concatenate([['Airline1']*len(AirPassengers), ['Airline2']*len(AirPassengers)])\n",
    "ds = np.tile(\n",
    "    pd.date_range(\n",
    "        start='1949-01-01', periods=len(AirPassengers), freq=pd.offsets.MonthEnd()\n",
    "    ).to_numpy(), \n",
    "    2,\n",
    ")\n",
    "y = np.concatenate([AirPassengers, AirPassengers+300])\n",
    "\n",
    "AirPassengersPanel = pd.DataFrame({'unique_id': unique_id, 'ds': ds, 'y': y})\n",
    "\n",
    "# For future exogenous variables\n",
    "# Declare SeasonalNaive12 and fill first 12 values with y\n",
    "snaive = AirPassengersPanel.groupby('unique_id')['y'].shift(periods=12).reset_index(drop=True)\n",
    "AirPassengersPanel['trend'] = range(len(AirPassengersPanel))\n",
    "AirPassengersPanel['y_[lag12]'] = snaive.fillna(AirPassengersPanel['y'])\n",
    "\n",
    "# Declare Static Data\n",
    "unique_id = np.array(['Airline1', 'Airline2'])\n",
    "airline1_dummy = [0, 1]\n",
    "airline2_dummy = [1, 0]\n",
    "AirPassengersStatic = pd.DataFrame({'unique_id': unique_id,\n",
    "                                    'airline1': airline1_dummy,\n",
    "                                    'airline2': airline2_dummy})\n",
    "\n",
    "AirPassengersPanel.groupby('unique_id').tail(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(1, 1, figsize = (20, 7))\n",
    "plot_df = AirPassengersPanel.set_index('ds')\n",
    "\n",
    "plot_df.groupby('unique_id')['y'].plot(legend=True)\n",
    "ax.set_title('AirPassengers Panel Data', fontsize=22)\n",
    "ax.set_ylabel('Monthly Passengers', fontsize=20)\n",
    "ax.set_xlabel('Timestamp [t]', fontsize=20)\n",
    "ax.legend(title='unique_id', prop={'size': 15})\n",
    "ax.grid()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(1, 1, figsize = (20, 7))\n",
    "plot_df = AirPassengersPanel[AirPassengersPanel.unique_id=='Airline1'].set_index('ds')\n",
    "\n",
    "plot_df[['y', 'trend', 'y_[lag12]']].plot(ax=ax, linewidth=2)\n",
    "ax.set_title('Box-Cox AirPassengers Data', fontsize=22)\n",
    "ax.set_ylabel('Monthly Passengers', fontsize=20)\n",
    "ax.set_xlabel('Timestamp [t]', fontsize=20)\n",
    "ax.legend(prop={'size': 15})\n",
    "ax.grid()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Time Features"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have developed a utility that generates normalized calendar features for use as absolute positional embeddings in Transformer-based models. These embeddings capture seasonal patterns in time series data and can be easily incorporated into the model architecture. Additionally, the features can be used as exogenous variables in other models to inform them of calendar patterns in the data."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**References**<br>\n",
    "- [Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, Wancai Zhang. \"Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting\"](https://arxiv.org/abs/2012.07436)<br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "class TimeFeature:\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def __call__(self, index: pd.DatetimeIndex):\n",
    "        return print('Overwrite with corresponding feature')\n",
    "\n",
    "    def __repr__(self):\n",
    "        return self.__class__.__name__ + \"()\"\n",
    "\n",
    "class SecondOfMinute(TimeFeature):\n",
    "    \"\"\"Minute of hour encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return index.second / 59.0 - 0.5\n",
    "\n",
    "class MinuteOfHour(TimeFeature):\n",
    "    \"\"\"Minute of hour encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return index.minute / 59.0 - 0.5\n",
    "\n",
    "class HourOfDay(TimeFeature):\n",
    "    \"\"\"Hour of day encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return index.hour / 23.0 - 0.5\n",
    "\n",
    "class DayOfWeek(TimeFeature):\n",
    "    \"\"\"Hour of day encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return index.dayofweek / 6.0 - 0.5\n",
    "\n",
    "class DayOfMonth(TimeFeature):\n",
    "    \"\"\"Day of month encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return (index.day - 1) / 30.0 - 0.5\n",
    "\n",
    "class DayOfYear(TimeFeature):\n",
    "    \"\"\"Day of year encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return (index.dayofyear - 1) / 365.0 - 0.5\n",
    "\n",
    "class MonthOfYear(TimeFeature):\n",
    "    \"\"\"Month of year encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return (index.month - 1) / 11.0 - 0.5\n",
    "\n",
    "class WeekOfYear(TimeFeature):\n",
    "    \"\"\"Week of year encoded as value between [-0.5, 0.5]\"\"\"\n",
    "    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:\n",
    "        return (index.week - 1) / 52.0 - 0.5\n",
    "\n",
    "def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:\n",
    "    \"\"\"\n",
    "    Returns a list of time features that will be appropriate for the given frequency string.\n",
    "    Parameters\n",
    "    ----------\n",
    "    freq_str\n",
    "        Frequency string of the form [multiple][granularity] such as \"12H\", \"5min\", \"1D\" etc.\n",
    "    \"\"\"\n",
    "\n",
    "    if freq_str not in ['Q', 'M', 'MS', 'W', 'D', 'B', 'H', 'T', 'S']:\n",
    "        raise Exception('Frequency not supported')\n",
    "    \n",
    "    if freq_str in ['Q','M', 'MS']:\n",
    "        return [cls() for cls in [MonthOfYear]]\n",
    "    elif freq_str == 'W':\n",
    "        return [cls() for cls in [DayOfMonth, WeekOfYear]]\n",
    "    elif freq_str in ['D','B']:\n",
    "        return [cls() for cls in [DayOfWeek, DayOfMonth, DayOfYear]]\n",
    "    elif freq_str == 'H':\n",
    "        return [cls() for cls in [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]\n",
    "    elif freq_str == 'T':\n",
    "        return [cls() for cls in [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]\n",
    "    else:\n",
    "        return [cls() for cls in [SecondOfMinute, MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]\n",
    "\n",
    "def augment_calendar_df(df, freq='H'):\n",
    "    \"\"\"\n",
    "    > * Q - [month]\n",
    "    > * M - [month]\n",
    "    > * W - [Day of month, week of year]\n",
    "    > * D - [Day of week, day of month, day of year]\n",
    "    > * B - [Day of week, day of month, day of year]\n",
    "    > * H - [Hour of day, day of week, day of month, day of year]\n",
    "    > * T - [Minute of hour*, hour of day, day of week, day of month, day of year]\n",
    "    > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]\n",
    "    *minute returns a number from 0-3 corresponding to the 15 minute period it falls into.\n",
    "    \"\"\"\n",
    "    df = df.copy()\n",
    "\n",
    "    freq_map = {\n",
    "        'Q':['month'],\n",
    "        'M':['month'],\n",
    "        'MS':['month'],\n",
    "        'W':['monthday', 'yearweek'],\n",
    "        'D':['weekday','monthday','yearday'],\n",
    "        'B':['weekday','monthday','yearday'],\n",
    "        'H':['dayhour','weekday','monthday','yearday'],\n",
    "        'T':['hourminute','dayhour','weekday','monthday','yearday'],\n",
    "        'S':['minutesecond','hourminute','dayhour','weekday','monthday','yearday']\n",
    "    }\n",
    "\n",
    "    ds_col = pd.to_datetime(df.ds.values)\n",
    "    ds_data = np.vstack([feat(ds_col) for feat in time_features_from_frequency_str(freq)]).transpose(1,0)\n",
    "    ds_data = pd.DataFrame(ds_data, columns=freq_map[freq])\n",
    "    \n",
    "    return pd.concat([df, ds_data], axis=1), freq_map[freq]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AirPassengerPanelCalendar, calendar_cols = augment_calendar_df(df=AirPassengersPanel, freq='M')\n",
    "AirPassengerPanelCalendar.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_df = AirPassengerPanelCalendar[AirPassengerPanelCalendar.unique_id=='Airline1'].set_index('ds')\n",
    "plt.plot(plot_df['month'])\n",
    "plt.grid()\n",
    "plt.xlabel('Datestamp')\n",
    "plt.ylabel('Normalized Month')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#| export\n",
    "def get_indexer_raise_missing(idx: pd.Index, vals: List[str]) -> List[int]:\n",
    "    idxs = idx.get_indexer(vals)\n",
    "    missing = [v for i, v in zip(idxs, vals) if i == -1]\n",
    "    if missing:\n",
    "        raise ValueError(f'The following values are missing from the index: {missing}')\n",
    "    return idxs"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}