# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/utils.ipynb.

# %% auto 0
__all__ = ['AirPassengers', 'AirPassengersDF', 'unique_id', 'ds', 'y', 'AirPassengersPanel', 'snaive', 'airline1_dummy',
           'airline2_dummy', 'AirPassengersStatic', 'generate_series', 'TimeFeature', 'SecondOfMinute', 'MinuteOfHour',
           'HourOfDay', 'DayOfWeek', 'DayOfMonth', 'DayOfYear', 'MonthOfYear', 'WeekOfYear',
           'time_features_from_frequency_str', 'augment_calendar_df', 'get_indexer_raise_missing']

# %% ../nbs/utils.ipynb 3
import random
from itertools import chain
from typing import List, Tuple, Union

import numpy as np
import pandas as pd


# %% ../nbs/utils.ipynb 6
def generate_series(
    n_series: int,
    freq: str = "D",
    min_length: int = 50,
    max_length: int = 500,
    n_temporal_features: int = 0,
    n_static_features: int = 0,
    equal_ends: bool = False,
    seed: int = 0,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """Generate Synthetic Panel Series.

    Generates `n_series` of frequency `freq` of different lengths in the interval [`min_length`, `max_length`].
    If `n_temporal_features > 0`, then each serie gets temporal features with random values.
    If `n_static_features > 0`, then a static dataframe is returned along the temporal dataframe.
    If `equal_ends == True` then all series end at the same date.

    **Parameters:**
    `n_series`: int, number of series for synthetic panel.
    `freq`: str, frequency of the data. Only `"D"` and `"M"` have a seasonality defined; any other value raises `KeyError`.
    `min_length`: int, minimal length of synthetic panel's series.
    `max_length`: int, maximal length of synthetic panel's series.
    `n_temporal_features`: int, default=0, number of temporal exogenous variables for synthetic panel's series.
    `n_static_features`: int, default=0, number of static exogenous variables for synthetic panel's series.
    `equal_ends`: bool, if True, series finish in the same date stamp `ds`.
    `seed`: int, seed for the random generators, so repeated calls return the same data.

    **Returns:**
    `temporal_df`: pandas.DataFrame, synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.
    When `n_static_features > 0` the return value is the tuple `(temporal_df, static_df)`, where
    `static_df` has columns [`static_0`, ..., `unique_id`].
    """
    seasonalities = {"D": 7, "M": 12}
    season = seasonalities[freq]

    rng = np.random.RandomState(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()

    dates = pd.date_range("2000-01-01", periods=max_length, freq=freq).values
    uids = [np.repeat(i, serie_length) for i, serie_length in enumerate(series_lengths)]
    if equal_ends:
        # Take the tail of the shared calendar so every series ends on the same date.
        ds = [dates[-serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]

    # Seasonal saw-tooth plus uniform noise.
    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5
    temporal_df = pd.DataFrame(
        dict(unique_id=chain.from_iterable(uids), ds=chain.from_iterable(ds), y=y)
    )

    random.seed(seed)
    for i in range(n_temporal_features):
        # NOTE(review): re-seeding on every iteration makes all temporal
        # features identical; kept as-is so generated data does not change.
        random.seed(seed)
        # One random integer per series, repeated over the series' length.
        temporal_values = [
            [random.randint(0, 100)] * serie_length for serie_length in series_lengths
        ]
        temporal_df[f"temporal_{i}"] = np.hstack(temporal_values)
        temporal_df[f"temporal_{i}"] = temporal_df[f"temporal_{i}"].astype("category")
        if i == 0:
            # Make the target depend on the first temporal feature.
            temporal_df["y"] = temporal_df["y"] * (
                1 + temporal_df[f"temporal_{i}"].cat.codes
            )

    temporal_df["unique_id"] = temporal_df["unique_id"].astype("category")
    temporal_df["unique_id"] = temporal_df["unique_id"].cat.as_ordered()

    if n_static_features > 0:
        # Draw from the seeded generator (not the global numpy state) so the
        # static features are reproducible for a given `seed`.
        static_features = rng.uniform(
            low=0.0, high=1.0, size=(n_series, n_static_features)
        )
        static_df = pd.DataFrame.from_records(
            static_features, columns=[f"static_{i}" for i in range(n_static_features)]
        )

        static_df["unique_id"] = np.arange(n_series)
        static_df["unique_id"] = static_df["unique_id"].astype("category")
        static_df["unique_id"] = static_df["unique_id"].cat.as_ordered()

        return temporal_df, static_df

    return temporal_df


# %% ../nbs/utils.ipynb 11
# Monthly airline passenger counts, 144 observations (12 per row below).
AirPassengers = np.array(
    [
        112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0, 104.0, 118.0,
        115.0, 126.0, 141.0, 135.0, 125.0, 149.0, 170.0, 170.0, 158.0, 133.0, 114.0, 140.0,
        145.0, 150.0, 178.0, 163.0, 172.0, 178.0, 199.0, 199.0, 184.0, 162.0, 146.0, 166.0,
        171.0, 180.0, 193.0, 181.0, 183.0, 218.0, 230.0, 242.0, 209.0, 191.0, 172.0, 194.0,
        196.0, 196.0, 236.0, 235.0, 229.0, 243.0, 264.0, 272.0, 237.0, 211.0, 180.0, 201.0,
        204.0, 188.0, 235.0, 227.0, 234.0, 264.0, 302.0, 293.0, 259.0, 229.0, 203.0, 229.0,
        242.0, 233.0, 267.0, 269.0, 270.0, 315.0, 364.0, 347.0, 312.0, 274.0, 237.0, 278.0,
        284.0, 277.0, 317.0, 313.0, 318.0, 374.0, 413.0, 405.0, 355.0, 306.0, 271.0, 306.0,
        315.0, 301.0, 356.0, 348.0, 355.0, 422.0, 465.0, 467.0, 404.0, 347.0, 305.0, 336.0,
        340.0, 318.0, 362.0, 348.0, 363.0, 435.0, 491.0, 505.0, 404.0, 359.0, 310.0, 337.0,
        360.0, 342.0, 406.0, 396.0, 420.0, 472.0, 548.0, 559.0, 463.0, 407.0, 362.0, 405.0,
        417.0, 391.0, 419.0, 461.0, 472.0, 535.0, 622.0, 606.0, 508.0, 461.0, 390.0, 432.0,
    ],
    dtype=np.float32,
)

# %% ../nbs/utils.ipynb 12
# Single-series frame: month-end stamps starting in January 1949.
AirPassengersDF = pd.DataFrame(
    {
        "unique_id": np.ones(len(AirPassengers)),
        "ds": pd.date_range(
            start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
        ),
        "y": AirPassengers,
    }
)

# %% ../nbs/utils.ipynb 19
# Declare Panel Data
unique_id = np.concatenate(
    [["Airline1"] * len(AirPassengers), ["Airline2"] * len(AirPassengers)]
)
ds = np.tile(
    pd.date_range(
        start="1949-01-01", periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
    ).to_numpy(),
    2,
)
# The second airline is the first one shifted up by 300.
y = np.concatenate([AirPassengers, AirPassengers + 300])

AirPassengersPanel = pd.DataFrame({"unique_id": unique_id, "ds": ds, "y": y})

# For future exogenous variables
# Declare SeasonalNaive12 and fill first 12 values with y
snaive = (
    AirPassengersPanel.groupby("unique_id")["y"]
    .shift(periods=12)
    .reset_index(drop=True)
)
AirPassengersPanel["trend"] = range(len(AirPassengersPanel))
AirPassengersPanel["y_[lag12]"] = snaive.fillna(AirPassengersPanel["y"])

# Declare Static Data
unique_id = np.array(["Airline1", "Airline2"])
airline1_dummy = [0, 1]
airline2_dummy = [1, 0]
AirPassengersStatic = pd.DataFrame(
    {"unique_id": unique_id, "airline1": airline1_dummy, "airline2": airline2_dummy}
)


# %% ../nbs/utils.ipynb 25
class TimeFeature:
    """Base class for calendar features computed from a `pd.DatetimeIndex`.

    Subclasses override `__call__` to return one value per timestamp.
    """

    def __init__(self):
        pass

    def __call__(self, index: pd.DatetimeIndex):
        # The base class carries no feature; fail loudly instead of silently
        # returning None.
        raise NotImplementedError("Overwrite with corresponding feature")

    def __repr__(self):
        return self.__class__.__name__ + "()"


class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.second / 59.0 - 0.5


class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.minute / 59.0 - 0.5


class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.hour / 23.0 - 0.5


class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.dayofweek / 6.0 - 0.5


class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.day - 1) / 30.0 - 0.5


class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.dayofyear - 1) / 365.0 - 0.5


class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.month - 1) / 11.0 - 0.5


class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        try:
            # `DatetimeIndex.week` was removed in pandas 2.0; the ISO calendar
            # week is its replacement.
            week = index.isocalendar().week.to_numpy(
                dtype=np.float64, na_value=np.nan
            )
        except AttributeError:
            # Older pandas without `isocalendar`.
            week = np.asarray(index.week, dtype=np.float64)
        return (week - 1) / 52.0 - 0.5


def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string. Must be exactly one of
        "Q", "M", "MS", "W", "D", "B", "H", "T" or "S"; multiples such as
        "12H" are not parsed.

    Raises
    ------
    ValueError
        If `freq_str` is not one of the supported frequencies.
    """
    daily = [DayOfWeek, DayOfMonth, DayOfYear]
    monthly = [MonthOfYear]
    features_by_freq = {
        "Q": monthly,
        "M": monthly,
        "MS": monthly,
        "W": [DayOfMonth, WeekOfYear],
        "D": daily,
        "B": daily,
        "H": [HourOfDay] + daily,
        "T": [MinuteOfHour, HourOfDay] + daily,
        "S": [SecondOfMinute, MinuteOfHour, HourOfDay] + daily,
    }
    try:
        feature_classes = features_by_freq[freq_str]
    except (KeyError, TypeError):
        # ValueError is still an `Exception`, so existing handlers keep working.
        raise ValueError(f"Frequency not supported: {freq_str!r}") from None
    return [cls() for cls in feature_classes]


def augment_calendar_df(df, freq="H"):
    """
    Append calendar features derived from the `ds` column of `df`.

    Columns added per frequency:

    > * Q - [month]
    > * M - [month]
    > * W - [Day of month, week of year]
    > * D - [Day of week, day of month, day of year]
    > * B - [Day of week, day of month, day of year]
    > * H - [Hour of day, day of week, day of month, day of year]
    > * T - [Minute of hour, hour of day, day of week, day of month, day of year]
    > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]

    Every feature is scaled to the interval [-0.5, 0.5].

    Returns the tuple `(augmented_df, feature_column_names)`; the input `df`
    is not modified.
    """
    df = df.copy()

    freq_map = {
        "Q": ["month"],
        "M": ["month"],
        "MS": ["month"],
        "W": ["monthday", "yearweek"],
        "D": ["weekday", "monthday", "yearday"],
        "B": ["weekday", "monthday", "yearday"],
        "H": ["dayhour", "weekday", "monthday", "yearday"],
        "T": ["hourminute", "dayhour", "weekday", "monthday", "yearday"],
        "S": [
            "minutesecond",
            "hourminute",
            "dayhour",
            "weekday",
            "monthday",
            "yearday",
        ],
    }

    ds_col = pd.to_datetime(df.ds.values)
    # One row per feature -> transpose to one row per timestamp.
    ds_data = np.vstack(
        [feat(ds_col) for feat in time_features_from_frequency_str(freq)]
    ).transpose(1, 0)
    # Reuse `df`'s index so the column-wise concat aligns row by row even when
    # `df` does not carry a default RangeIndex.
    ds_data = pd.DataFrame(ds_data, columns=freq_map[freq], index=df.index)

    return pd.concat([df, ds_data], axis=1), freq_map[freq]


# %% ../nbs/utils.ipynb 28
def get_indexer_raise_missing(idx: pd.Index, vals: List[str]) -> np.ndarray:
    """Return the positions of `vals` in `idx`.

    Raises `ValueError` listing every value of `vals` that is absent from `idx`.
    """
    idxs = idx.get_indexer(vals)
    # `get_indexer` marks values that are not found with -1.
    missing = [v for i, v in zip(idxs, vals) if i == -1]
    if missing:
        raise ValueError(f"The following values are missing from the index: {missing}")
    return idxs