import os
import pandas as pd
from neuralforecast import NeuralForecast
from neuralforecast.auto import (
AutoNHITS as _AutoNHITS,
AutoTFT as _AutoTFT,
)
from neuralforecast.common._base_model import BaseModel as NeuralForecastModel
from ray import tune
from ..utils.forecaster import Forecaster
os.environ["NIXTLA_ID_AS_COL"] = "true"
def run_neuralforecast_model(
model: NeuralForecastModel,
df: pd.DataFrame,
freq: str,
) -> pd.DataFrame:
nf = NeuralForecast(
models=[model],
freq=freq,
)
nf.fit(df=df)
fcst_df = nf.predict()
return fcst_df
class AutoNHITS(Forecaster):
def __init__(
self,
alias: str = "AutoNHITS",
num_samples: int = 10,
backend: str = "optuna",
):
self.alias = alias
self.num_samples = num_samples
self.backend = backend
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
config = _AutoNHITS.get_default_config(h=h, backend="ray")
config["scaler_type"] = tune.choice(["robust"])
if self.backend == "optuna":
config = _AutoNHITS._ray_config_to_optuna(config)
fcst_df = run_neuralforecast_model(
model=_AutoNHITS(
h=h,
alias=self.alias,
num_samples=self.num_samples,
backend=self.backend,
config=config,
),
df=df,
freq=freq,
)
return fcst_df
class AutoTFT(Forecaster):
def __init__(
self,
alias: str = "AutoTFT",
num_samples: int = 10,
backend: str = "optuna",
):
self.alias = alias
self.num_samples = num_samples
self.backend = backend
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
config = _AutoTFT.get_default_config(h=h, backend="ray")
config["scaler_type"] = tune.choice(["robust"])
if self.backend == "optuna":
config = _AutoTFT._ray_config_to_optuna(config)
fcst_df = run_neuralforecast_model(
model=_AutoTFT(
h=h,
alias=self.alias,
num_samples=self.num_samples,
backend=self.backend,
config=config,
),
df=df,
freq=freq,
)
return fcst_df
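# Hedged usage sketch (not part of the original module): how the wrappers above
# build their hyperparameter search space. Both Auto models start from the
# library's default ray search space, pin the scaler, and optionally translate
# the space for the optuna backend before training.
if __name__ == "__main__":
    example_config = _AutoNHITS.get_default_config(h=24, backend="ray")
    example_config["scaler_type"] = tune.choice(["robust"])
    # with backend="optuna", the ray search space is converted first
    example_config = _AutoNHITS._ray_config_to_optuna(example_config)
    print(type(example_config))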
from copy import deepcopy
from typing import List
from threadpoolctl import threadpool_limits
import pandas as pd
from prophet import Prophet
from ..utils.parallel_forecaster import ParallelForecaster
from ..utils.forecaster import Forecaster
class NixtlaProphet(Prophet, ParallelForecaster, Forecaster):
def __init__(
self,
alias: str = "Prophet",
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.alias = alias
def __local_forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
quantiles: List[float] | None = None,
) -> pd.DataFrame:
if quantiles is not None:
raise NotImplementedError
model = deepcopy(self)
model.fit(df=df)
future_df = model.make_future_dataframe(
periods=h,
include_history=False,
freq=freq,
)
fcst_df = model.predict(future_df)
fcst_df = fcst_df.rename({"yhat": self.alias}, axis=1)
fcst_df = fcst_df[["ds", self.alias]]
return fcst_df
def _local_forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
quantiles: List[float] | None = None,
) -> pd.DataFrame:
with threadpool_limits(limits=1):
return self.__local_forecast(
df=df,
h=h,
freq=freq,
quantiles=quantiles,
)
import os
import pandas as pd
from statsforecast import StatsForecast
from statsforecast.models import (
_TS as StatsForecastModel,
ADIDA as _ADIDA,
AutoARIMA as _AutoARIMA,
AutoCES as _AutoCES,
AutoETS as _AutoETS,
CrostonClassic as _CrostonClassic,
DynamicOptimizedTheta as _DOTheta,
HistoricAverage as _HistoricAverage,
IMAPA as _IMAPA,
SeasonalNaive as _SeasonalNaive,
Theta as _Theta,
ZeroModel as _ZeroModel,
)
from ..utils.forecaster import Forecaster, get_seasonality
os.environ["NIXTLA_ID_AS_COL"] = "true"
def run_statsforecast_model(
model: StatsForecastModel,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
sf = StatsForecast(
models=[model],
freq=freq,
n_jobs=-1,
fallback_model=_SeasonalNaive(
season_length=get_seasonality(freq),
),
)
fcst_df = sf.forecast(df=df, h=h)
return fcst_df
class ADIDA(Forecaster):
def __init__(
self,
alias: str = "ADIDA",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
fcst_df = run_statsforecast_model(
model=_ADIDA(alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class AutoARIMA(Forecaster):
def __init__(
self,
alias: str = "AutoARIMA",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_AutoARIMA(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class AutoCES(Forecaster):
def __init__(
self,
alias: str = "AutoCES",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_AutoCES(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class AutoETS(Forecaster):
def __init__(
self,
alias: str = "AutoETS",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_AutoETS(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class CrostonClassic(Forecaster):
def __init__(
self,
alias: str = "CrostonClassic",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
fcst_df = run_statsforecast_model(
model=_CrostonClassic(alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class DOTheta(Forecaster):
def __init__(
self,
alias: str = "DOTheta",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_DOTheta(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class HistoricAverage(Forecaster):
def __init__(
self,
alias: str = "HistoricAverage",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
fcst_df = run_statsforecast_model(
model=_HistoricAverage(alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class IMAPA(Forecaster):
def __init__(
self,
alias: str = "IMAPA",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
fcst_df = run_statsforecast_model(
model=_IMAPA(alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class SeasonalNaive(Forecaster):
def __init__(
self,
alias: str = "SeasonalNaive",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_SeasonalNaive(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class Theta(Forecaster):
def __init__(
self,
alias: str = "Theta",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
seasonality = get_seasonality(freq)
fcst_df = run_statsforecast_model(
model=_Theta(season_length=seasonality, alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
class ZeroModel(Forecaster):
def __init__(
self,
alias: str = "ZeroModel",
):
self.alias = alias
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
fcst_df = run_statsforecast_model(
model=_ZeroModel(alias=self.alias),
df=df,
h=h,
freq=freq,
)
return fcst_df
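# Hedged usage sketch (not part of the original module): every wrapper expects a
# long-format frame with unique_id, ds and y columns and returns a frame with one
# forecast column named after the model alias.
if __name__ == "__main__":
    example_df = pd.DataFrame(
        {
            "unique_id": ["series_1"] * 72,
            "ds": pd.date_range("2024-01-01", periods=72, freq="H"),
            "y": [float(i % 24) for i in range(72)],
        }
    )
    print(SeasonalNaive().forecast(df=example_df, h=24, freq="H").head())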
from .chronos import Chronos
from .lagllama import LagLlama
from .moirai import Moirai
from .timegpt import TimeGPT
from .timesfm import TimesFM
__all__ = [
"Chronos",
"LagLlama",
"Moirai",
"TimeGPT",
"TimesFM",
]
from typing import Iterable, List
import numpy as np
import pandas as pd
import torch
from chronos import ChronosPipeline
from tqdm import tqdm
from utilsforecast.processing import make_future_dataframe
from ..utils.forecaster import Forecaster
class TimeSeriesDataset:
def __init__(
self,
data: torch.Tensor,
uids: Iterable,
last_times: Iterable,
batch_size: int,
):
self.data = data
self.uids = uids
self.last_times = last_times
self.batch_size = batch_size
self.n_batches = len(data) // self.batch_size + (
0 if len(data) % self.batch_size == 0 else 1
)
self.current_batch = 0
@classmethod
def from_df(cls, df: pd.DataFrame, batch_size: int):
num_unique_ids = df["unique_id"].nunique()
max_series_length = df["unique_id"].value_counts().max()
padded_tensor = torch.full(
size=(num_unique_ids, max_series_length),
fill_value=torch.nan,
dtype=torch.bfloat16,
) # type: ignore
df_sorted = df.sort_values(by=["unique_id", "ds"])
for idx, (_, group) in enumerate(df_sorted.groupby("unique_id")):
series_length = len(group)
padded_tensor[idx, -series_length:] = torch.tensor(
group["y"].values,
dtype=torch.bfloat16,
)
uids = df_sorted["unique_id"].unique()
last_times = df_sorted.groupby("unique_id")["ds"].tail(1)
return cls(padded_tensor, uids, last_times, batch_size)
def __len__(self):
return self.n_batches
def make_future_dataframe(self, h: int, freq: str) -> pd.DataFrame:
return make_future_dataframe(
uids=self.uids,
last_times=pd.to_datetime(self.last_times),
h=h,
freq=freq,
) # type: ignore
def __iter__(self):
self.current_batch = 0 # Reset for new iteration
return self
def __next__(self):
if self.current_batch < self.n_batches:
start_idx = self.current_batch * self.batch_size
end_idx = start_idx + self.batch_size
self.current_batch += 1
return self.data[start_idx:end_idx]
else:
raise StopIteration
class Chronos(Forecaster):
def __init__(
self,
repo_id: str = "amazon/chronos-t5-large",
batch_size: int = 16,
alias: str = "Chronos",
):
self.repo_id = repo_id
self.batch_size = batch_size
self.alias = alias
self.model = ChronosPipeline.from_pretrained(
repo_id,
device_map="auto",
torch_dtype=torch.bfloat16,
)
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
dataset = TimeSeriesDataset.from_df(df, batch_size=self.batch_size)
fcsts = [
self.model.predict(batch, prediction_length=h) for batch in tqdm(dataset)
]
fcst = torch.cat(fcsts)
fcst = fcst.numpy()
fcst_df = dataset.make_future_dataframe(h=h, freq=freq)
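        # fcst has shape (n_series, num_samples, h); averaging over the sample
        # dimension yields one point forecast per series and horizon step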
fcst_df[self.alias] = np.mean(fcst, axis=1).reshape(-1, 1)
return fcst_df
from gluonts.torch.model.predictor import PyTorchPredictor
from lag_llama.gluon.estimator import LagLlamaEstimator
from ..utils.gluonts_forecaster import GluonTSForecaster
class LagLlama(GluonTSForecaster):
def __init__(
self,
repo_id: str = "time-series-foundation-models/Lag-Llama",
filename: str = "lag-llama.ckpt",
alias: str = "LagLlama",
):
super().__init__(
repo_id=repo_id,
filename=filename,
alias=alias,
)
def get_predictor(self, prediction_length: int) -> PyTorchPredictor:
ckpt = self.load()
estimator_args = ckpt["hyper_parameters"]["model_kwargs"]
# this context length is reported in the paper
context_length = 32
estimator = LagLlamaEstimator(
ckpt_path=self.checkpoint_path,
prediction_length=prediction_length,
context_length=context_length,
# estimator args
input_size=estimator_args["input_size"],
n_layer=estimator_args["n_layer"],
n_embd_per_head=estimator_args["n_embd_per_head"],
n_head=estimator_args["n_head"],
scaling=estimator_args["scaling"],
time_feat=estimator_args["time_feat"],
)
lightning_module = estimator.create_lightning_module()
transformation = estimator.create_transformation()
predictor = estimator.create_predictor(transformation, lightning_module)
return predictor
from gluonts.torch.model.predictor import PyTorchPredictor
from uni2ts.model.moirai import MoiraiForecast, MoiraiModule
from ..utils.gluonts_forecaster import GluonTSForecaster
class Moirai(GluonTSForecaster):
def __init__(
self,
repo_id: str = "Salesforce/moirai-1.0-R-large",
filename: str = "model.ckpt",
alias: str = "Moirai",
):
super().__init__(
repo_id=repo_id,
filename=filename,
alias=alias,
)
def get_predictor(self, prediction_length: int) -> PyTorchPredictor:
model = MoiraiForecast(
module=MoiraiModule.from_pretrained(self.repo_id),
prediction_length=prediction_length,
context_length=200,
patch_size="auto",
num_samples=100,
target_dim=1,
feat_dynamic_real_dim=0,
past_feat_dynamic_real_dim=0,
)
predictor = model.create_predictor(batch_size=32)
return predictor
import os
import pandas as pd
from dotenv import load_dotenv
from nixtla import NixtlaClient
from typing import Optional
from ..utils.forecaster import Forecaster
load_dotenv()
class TimeGPT(Forecaster):
def __init__(
self,
api_key: str | None = None,
base_url: Optional[str] = None,
max_retries: int = 1,
model: str = "timegpt-1",
alias: str = "TimeGPT",
):
self.api_key = api_key
self.base_url = base_url
self.max_retries = max_retries
self.model = model
self.alias = alias
def _get_client(self) -> NixtlaClient:
if self.api_key is None:
api_key = os.environ["NIXTLA_API_KEY"]
else:
api_key = self.api_key
return NixtlaClient(
api_key=api_key,
base_url=self.base_url,
max_retries=self.max_retries,
)
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
client = self._get_client()
fcst_df = client.forecast(
df=df,
h=h,
freq=freq,
model=self.model,
)
fcst_df["ds"] = pd.to_datetime(fcst_df["ds"])
fcst_df = fcst_df.rename(columns={"TimeGPT": self.alias})
return fcst_df
import pandas as pd
import timesfm
import torch
from paxml import checkpoints
from ..utils.forecaster import Forecaster
class TimesFM(Forecaster):
def __init__(
self,
repo_id: str = "google/timesfm-1.0-200m",
context_length: int = 512,
batch_size: int = 64,
alias: str = "TimesFM",
):
self.repo_id = repo_id
self.context_length = context_length
self.batch_size = batch_size
self.alias = alias
def get_predictor(
self,
prediction_length: int,
) -> timesfm.TimesFm:
backend = "gpu" if torch.cuda.is_available() else "cpu"
tfm = timesfm.TimesFm(
context_len=self.context_length,
horizon_len=prediction_length,
input_patch_len=32,
output_patch_len=128,
num_layers=20,
model_dims=1280,
backend=backend,
per_core_batch_size=self.batch_size,
)
tfm.load_from_checkpoint(repo_id=self.repo_id)
return tfm
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
predictor = self.get_predictor(prediction_length=h)
fcst_df = predictor.forecast_on_df(
inputs=df,
freq=freq,
value_name="y",
model_name=self.alias,
num_jobs=1,
)
fcst_df = fcst_df[["unique_id", "ds", self.alias]]
return fcst_df
from typing import List
import pandas as pd
from gluonts.time_feature.seasonality import get_seasonality as _get_seasonality
from tqdm import tqdm
from utilsforecast.processing import (
backtest_splits,
drop_index_if_pandas,
join,
maybe_compute_sort_indices,
take_rows,
vertical_concat,
)
def get_seasonality(freq: str) -> int:
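    # seasonal period for a pandas frequency string; gluonts defaults apply
    # (e.g. hourly -> 24), with daily overridden to a weekly cycle of 7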
return _get_seasonality(freq, seasonalities={"D": 7})
def maybe_convert_col_to_datetime(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
if not pd.api.types.is_datetime64_any_dtype(df[col_name]):
df = df.copy()
df[col_name] = pd.to_datetime(df[col_name])
return df
class Forecaster:
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
raise NotImplementedError
def cross_validation(
self,
df: pd.DataFrame,
h: int,
freq: str,
n_windows: int = 1,
step_size: int | None = None,
) -> pd.DataFrame:
df = maybe_convert_col_to_datetime(df, "ds")
# mlforecast cv code
results = []
sort_idxs = maybe_compute_sort_indices(df, "unique_id", "ds")
if sort_idxs is not None:
df = take_rows(df, sort_idxs)
splits = backtest_splits(
df,
n_windows=n_windows,
h=h,
id_col="unique_id",
time_col="ds",
freq=pd.tseries.frequencies.to_offset(freq),
step_size=h if step_size is None else step_size,
)
for cutoffs, train, valid in tqdm(splits):
if len(valid.columns) > 3:
raise NotImplementedError(
"Cross validation with exogenous variables is not yet supported."
)
y_pred = self.forecast(
df=train,
h=h,
freq=freq,
)
y_pred = join(y_pred, cutoffs, on="unique_id", how="left")
result = join(
valid[["unique_id", "ds", "y"]],
y_pred,
on=["unique_id", "ds"],
)
if result.shape[0] < valid.shape[0]:
raise ValueError(
"Cross validation result produced less results than expected. "
"Please verify that the frequency parameter (freq) matches your series' "
"and that there aren't any missing periods."
)
results.append(result)
out = vertical_concat(results)
out = drop_index_if_pandas(out)
first_out_cols = ["unique_id", "ds", "cutoff", "y"]
remaining_cols = [c for c in out.columns if c not in first_out_cols]
fcst_cv_df = out[first_out_cols + remaining_cols]
return fcst_cv_df
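# Hedged usage sketch (not part of the original module): cross_validation only
# needs a subclass that implements forecast(); the hypothetical NaiveDemo below
# repeats each series' last observed value over the horizon.
if __name__ == "__main__":
    class NaiveDemo(Forecaster):
        alias = "NaiveDemo"

        def forecast(self, df: pd.DataFrame, h: int, freq: str) -> pd.DataFrame:
            last = df.sort_values("ds").groupby("unique_id").tail(1)
            out = []
            for _, row in last.iterrows():
                ds = pd.date_range(row["ds"], periods=h + 1, freq=freq)[1:]
                out.append(
                    pd.DataFrame(
                        {"unique_id": row["unique_id"], "ds": ds, self.alias: row["y"]}
                    )
                )
            return pd.concat(out, ignore_index=True)

    demo_df = pd.DataFrame(
        {
            "unique_id": "series_1",
            "ds": pd.date_range("2024-01-01", periods=48, freq="D"),
            "y": range(48),
        }
    )
    print(NaiveDemo().cross_validation(df=demo_df, h=7, freq="D", n_windows=2).head())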
from typing import Iterable, List, Any
import pandas as pd
import torch
from gluonts.dataset.pandas import PandasDataset
from gluonts.model.forecast import Forecast
from gluonts.torch.model.predictor import PyTorchPredictor
from huggingface_hub import hf_hub_download
from tqdm import tqdm
from .forecaster import Forecaster
def fix_freq(freq: str) -> str:
# see https://github.com/awslabs/gluonts/pull/2462/files
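    # e.g. "MS" (month start) becomes "M"; a bare "S" (seconds) is left unchanged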
if len(freq) > 1 and freq.endswith("S"):
return freq[:-1]
return freq
def maybe_convert_col_to_float32(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
if df[col_name].dtype != "float32":
df = df.copy()
df[col_name] = df[col_name].astype("float32")
return df
class GluonTSForecaster(Forecaster):
def __init__(self, repo_id: str, filename: str, alias: str):
self.repo_id = repo_id
self.filename = filename
self.alias = alias
@property
def checkpoint_path(self) -> str:
return hf_hub_download(
repo_id=self.repo_id,
filename=self.filename,
)
@property
def map_location(self) -> str:
map_location = "cuda:0" if torch.cuda.is_available() else "cpu"
return map_location
def load(self) -> Any:
return torch.load(
self.checkpoint_path,
map_location=self.map_location,
)
def get_predictor(self, prediction_length: int) -> PyTorchPredictor:
raise NotImplementedError
def gluonts_instance_fcst_to_df(
self,
fcst: Forecast,
freq: str,
model_name: str,
) -> pd.DataFrame:
point_forecast = fcst.mean
h = len(point_forecast)
dates = pd.date_range(
fcst.start_date.to_timestamp(),
freq=freq,
periods=h,
)
fcst_df = pd.DataFrame(
{
"ds": dates,
"unique_id": fcst.item_id,
model_name: point_forecast,
}
)
return fcst_df
def gluonts_fcsts_to_df(
self,
fcsts: Iterable[Forecast],
freq: str,
model_name: str,
) -> pd.DataFrame:
df = []
for fcst in tqdm(fcsts):
fcst_df = self.gluonts_instance_fcst_to_df(
fcst=fcst,
freq=freq,
model_name=model_name,
)
df.append(fcst_df)
return pd.concat(df).reset_index(drop=True)
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
) -> pd.DataFrame:
df = maybe_convert_col_to_float32(df, "y")
gluonts_dataset = PandasDataset.from_long_dataframe(
df,
target="y",
item_id="unique_id",
timestamp="ds",
freq=fix_freq(freq),
)
predictor = self.get_predictor(prediction_length=h)
fcsts = predictor.predict(gluonts_dataset, num_samples=100)
fcst_df = self.gluonts_fcsts_to_df(
fcsts,
freq=freq,
model_name=self.alias,
)
return fcst_df
import os
from multiprocessing import Pool
from typing import Callable, List
import pandas as pd
class ParallelForecaster:
def _process_group(
self,
df: pd.DataFrame,
func: Callable,
**kwargs,
) -> pd.DataFrame:
uid = df["unique_id"].iloc[0]
_df = df.drop("unique_id", axis=1)
res_df = func(_df, **kwargs)
res_df.insert(0, "unique_id", uid)
return res_df
def _apply_parallel(
self,
df_grouped: pd.DataFrame,
func: Callable,
**kwargs,
) -> pd.DataFrame:
with Pool(max(1, (os.cpu_count() or 2) - 1)) as executor:
futures = [
executor.apply_async(
self._process_group,
args=(df, func),
kwds=kwargs,
)
for _, df in df_grouped
]
results = [future.get() for future in futures]
return pd.concat(results)
def _local_forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
quantiles: List[float] | None = None,
) -> pd.DataFrame:
raise NotImplementedError
def forecast(
self,
df: pd.DataFrame,
h: int,
freq: str,
quantiles: List[float] | None = None,
) -> pd.DataFrame:
fcst_df = self._apply_parallel(
df.groupby("unique_id"),
self._local_forecast,
h=h,
freq=freq,
quantiles=quantiles,
)
return fcst_df
import logging
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
logging.basicConfig(level=logging.INFO)
main_logger = logging.getLogger(__name__)
def read_parquet_and_assign(uid, url):
df = pd.read_parquet(url)
df["unique_id"] = uid
df["ds"] = df["ds"].astype(str)
return df[["unique_id", "ds", "y"]]
def download_data():
catalogue_splits = pd.read_csv("./data/series_catalogue_hourly.csv")
catalogue_df = catalogue_splits.query("dataset == 'moirai'").copy()
catalogue_df["pandas_frequency"] = "H"
catalogue_df["seasonality"] = 24
catalogue_df["horizon"] = 24
catalogue_df = catalogue_df.query("split == 'test'")[
[
"unique_id",
"frequency",
"url",
"pandas_frequency",
"seasonality",
"horizon",
]
]
grouped_df = catalogue_df.groupby(["frequency", "pandas_frequency"])
for (frequency, pandas_frequency), df in grouped_df:
uids, urls = df["unique_id"].values, df["url"].values
main_logger.info(
f"frequency: {frequency}, pandas_frequency: {pandas_frequency}"
)
n_uids = len(uids)
main_logger.info(f"number of uids: {n_uids}")
max_workers = min(10, n_uids)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(read_parquet_and_assign, uid, url)
for uid, url in zip(uids, urls)
]
results = [future.result() for future in futures]
main_logger.info("dataset read")
Y_df = pd.concat(results)
Y_df = Y_df.merge(
df.drop(columns="url"),
on="unique_id",
how="left",
)
# Y_df.to_parquet(f"./data/{frequency}_{pandas_frequency}.parquet")
Y_df.to_parquet("./data/filtered_datasets/moirai-data.parquet")
del Y_df
main_logger.info("dataset saved")
if __name__ == "__main__":
download_data()
import warnings
from dataclasses import dataclass, asdict
from functools import partial
from pathlib import Path
from typing import Any, Callable, List
import pandas as pd
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import mae, _zero_to_nan
from .logger_config import setup_logger
warnings.simplefilter(
action="ignore",
category=FutureWarning,
)
main_logger = setup_logger(__name__)
def mase(
df: pd.DataFrame,
models: List[str],
seasonality: int,
train_df: pd.DataFrame,
id_col: str = "unique_id",
target_col: str = "y",
) -> pd.DataFrame:
mean_abs_err = mae(df, models, id_col, target_col)
mean_abs_err = mean_abs_err.set_index(id_col)
# assume train_df is sorted
lagged = train_df.groupby(id_col, observed=True)[target_col].shift(seasonality)
scale = train_df[target_col].sub(lagged).abs()
scale = scale.groupby(train_df[id_col], observed=True).mean()
scale[scale < 1e-2] = 0.0
res = mean_abs_err.div(_zero_to_nan(scale), axis=0).fillna(0)
res.index.name = id_col
res = res.reset_index()
return res
def generate_train_cv_splits(
df: pd.DataFrame,
cutoffs: pd.DataFrame,
) -> pd.DataFrame:
"""
Based on `cutoffs` (columns `unique_id`, `cutoff`),
generates train CV splits from `df`.
"""
df = df.merge(cutoffs, on="unique_id", how="outer")
df = df.query("ds <= cutoff")
df = df.reset_index(drop=True)
return df
@dataclass
class DatasetParams:
frequency: str
pandas_frequency: str
horizon: int
seasonality: int
@staticmethod
def _get_value_from_df_col(
df: pd.DataFrame,
col: str,
dtype: Callable | None = None,
) -> Any:
col_values = df[col].unique()
if len(col_values) > 1:
raise ValueError(f"{col} is not unique: {col_values}")
value = col_values[0]
if dtype is not None:
value = dtype(value)
return value
@classmethod
def from_df(cls, df: pd.DataFrame) -> "DatasetParams":
dataset_params = {}
dataset_params_cols = [
"frequency",
"pandas_frequency",
"horizon",
"seasonality",
]
dataset_params_cols_dtypes = [str, str, int, int]
for col, dtype in zip(dataset_params_cols, dataset_params_cols_dtypes):
dataset_params[col] = cls._get_value_from_df_col(df, col, dtype=dtype)
return cls(**dataset_params)
@dataclass
class ExperimentDataset(DatasetParams):
df: pd.DataFrame
@classmethod
def from_df(cls, df: pd.DataFrame) -> "ExperimentDataset":
"""
Parameters
----------
df : pd.DataFrame
df should have columns:
unique_id, ds, y, frequency, pandas_frequency, horizon, seasonality
"""
ds_params = DatasetParams.from_df(df=df)
df = df[["unique_id", "ds", "y"]] # type: ignore
return cls(
df=df,
**asdict(ds_params),
)
@classmethod
def from_parquet(
cls,
parquet_path: str | Path,
) -> "ExperimentDataset":
df = pd.read_parquet(parquet_path)
return cls.from_df(df=df)
def evaluate_forecast_df(
self,
forecast_df: pd.DataFrame,
models: List[str],
) -> pd.DataFrame:
"""
Parameters
----------
forecast_df : pd.DataFrame
df should have columns: unique_id, ds, cutoff, y, and models
"""
for model in models:
if forecast_df[model].isna().sum() > 0:
print(forecast_df.loc[forecast_df[model].isna()]["unique_id"].unique())
raise ValueError(f"model {model} has NaN values")
cutoffs = forecast_df[["unique_id", "cutoff"]].drop_duplicates()
train_cv_splits = generate_train_cv_splits(df=self.df, cutoffs=cutoffs)
def add_id_cutoff(df: pd.DataFrame):
df["id_cutoff"] = (
df["unique_id"].astype(str) + "-" + df["cutoff"].astype(str)
)
for df in [cutoffs, train_cv_splits, forecast_df]:
add_id_cutoff(df)
partial_mase = partial(mase, seasonality=self.seasonality)
eval_df = evaluate(
df=forecast_df,
train_df=train_cv_splits,
metrics=[partial_mase],
models=models,
id_col="id_cutoff",
)
eval_df = eval_df.merge(cutoffs, on=["id_cutoff"])
eval_df = eval_df.drop(columns=["id_cutoff"])
eval_df = eval_df[["unique_id", "cutoff", "metric"] + models]
return eval_df
@dataclass
class ForecastDataset:
forecast_df: pd.DataFrame
time_df: pd.DataFrame
@classmethod
def from_dir(cls, dir: str | Path):
dir_ = Path(dir)
forecast_df = pd.read_parquet(dir_ / "forecast_df.parquet")
time_df = pd.read_parquet(dir_ / "time_df.parquet")
return cls(forecast_df=forecast_df, time_df=time_df)
@staticmethod
def is_forecast_ready(dir: str | Path):
dir_ = Path(dir)
forecast_path = dir_ / "forecast_df.parquet"
time_path = dir_ / "time_df.parquet"
return forecast_path.exists() and time_path.exists()
def save_to_dir(self, dir: str | Path):
dir_ = Path(dir)
dir_.mkdir(parents=True, exist_ok=True)
self.forecast_df.to_parquet(dir_ / "forecast_df.parquet")
self.time_df.to_parquet(dir_ / "time_df.parquet")
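# Hedged usage sketch (not part of the original module): ForecastDataset
# round-trips a pair of parquet files through a directory, which is how runs can
# check whether a model's forecasts were already produced.
if __name__ == "__main__":
    demo = ForecastDataset(
        forecast_df=pd.DataFrame(
            {"unique_id": ["a"], "ds": ["2024-01-01"], "model": [1.0]}
        ),
        time_df=pd.DataFrame({"model": ["model"], "time": [0.1]}),
    )
    demo.save_to_dir("./tmp_forecast_demo")
    assert ForecastDataset.is_forecast_ready("./tmp_forecast_demo")
    print(ForecastDataset.from_dir("./tmp_forecast_demo").forecast_df)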
"""
This module takes Nixtla's benchmarking data
and filters it to prevent AzureML from crashing
in the following case:
- series that are too short, see https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-methods?view=azureml-api-2#data-length-requirements
"""
import logging
from pathlib import Path
import fire
import numpy as np
import pandas as pd
main_logger = logging.getLogger(__name__)
main_logger.setLevel(logging.INFO)
def get_min_size_per_series(dataset_path: str) -> int:
if "Daily" in dataset_path or "Hourly" in dataset_path:
return 1_000
elif "Monthly" in dataset_path:
return 10 * 12
else:
return 1_000 // 7
def filter_and_clean_dataset(
dataset_path: str,
max_series: int = 1_000,
random_seed: int = 420,
):
main_logger.info(f"Processing dataset {dataset_path}")
df = pd.read_parquet(dataset_path)
df = df.drop_duplicates(["unique_id", "ds"]) # type: ignore
df = df.sort_values(["unique_id", "ds"])
min_size_per_series = get_min_size_per_series(dataset_path)
df = (
df.groupby("unique_id")
.filter(lambda x: len(x) >= min_size_per_series)
.reset_index(drop=True)
)
uids = df["unique_id"].unique() # type: ignore
if len(uids) > max_series:
    main_logger.info(f"Filtering out {len(uids) - max_series} series")
    np.random.seed(random_seed)
    uids = np.random.choice(uids, max_series, replace=False)  # type: ignore
    df = df.query("unique_id in @uids")  # type: ignore
n_series = len(df["unique_id"].unique()) # type: ignore
main_logger.info(f"Number of series: {n_series}")
if n_series == 0:
raise ValueError("No series left after filtering")
# finally we clean some strange dates
mask = df["ds"].str.endswith(":01") # type: ignore
df.loc[mask, "ds"] = df.loc[mask, "ds"].str[:-3] + ":00"
# save the dataset
dataset_path = Path(dataset_path) # type: ignore
filtered_dataset_path = dataset_path.parent / "filtered_datasets" / dataset_path.name # type: ignore
filtered_dataset_path.parent.mkdir(exist_ok=True, parents=True)
df.to_parquet(filtered_dataset_path)
main_logger.info(f"Filtered dataset saved to {filtered_dataset_path}")
if __name__ == "__main__":
fire.Fire(filter_and_clean_dataset)
import logging
def setup_logger(logger_name, log_file=None):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    if logger.handlers:
        # avoid attaching duplicate handlers when the same logger is configured twice
        return logger
    formatter = logging.Formatter(
        "%(asctime)s,%(levelname)s,%(module)s,%(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    if log_file is not None:
        # also log to a file when a path is provided
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
download_lag_llama_code:
@git clone https://github.com/time-series-foundation-models/lag-llama tempdir
@cp -R tempdir/data/ .
@cp -R tempdir/gluon_utils/ .
@cp -R tempdir/lag_llama/ .
@cp -R tempdir/requirements.txt lag-llama-requirements.txt
@rm -rf tempdir
download_lag_llama_model:
@huggingface-cli download time-series-foundation-models/Lag-Llama lag-llama.ckpt --local-dir ./models/
# LagLlama is 40% less accurate than a simple SeasonalNaive and 1000x slower.
We present a fully reproducible experiment showing that SeasonalNaive significantly outperforms LagLlama, a recently introduced open-source foundational model for time series forecasting (a deep learning architecture pre-trained on time series datasets). Specifically, **SeasonalNaive achieves 42%, 24%, and 16% better performance** in terms of MASE, MAPE, and CRPS respectively, and boasts **a 1,000x speed advantage**. These findings are based on an extensive analysis covering 105,289 unique time series from the M1, M3, M4, and Tourism datasets, which were omitted in the original LagLlama paper.
# Introduction
In the field of time series forecasting, recent developments have introduced foundational models such as LagLlama, which uses deep learning and extensive pretraining data with the aim of improving predictive performance. LagLlama deserves credit as one of the first open-source foundational models. However, contrary to expectations, our analysis indicates that the traditional SeasonalNaive model, known for its straightforward approach of extending past seasonal patterns into future predictions, outperforms LagLlama in both accuracy and computational efficiency.
## Empirical Evaluation
The original paper assesses model performance on 3,113 time series and reports only CRPS, omitting point forecast error metrics widely used in academia and industry, e.g. MASE and MAPE.
Our evaluation encompasses 105,289 unique time series from different datasets, including M1, M3, M4, and Tourism, covering yearly, quarterly, monthly, weekly, daily, and hourly frequencies. This diverse dataset selection allows for a robust assessment of the models across various time series characteristics and forecasting horizons. We also reproduce results for Pedestrian Counts and Weather originally included in the paper/code to show that we are running LagLlama correctly.
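For readers less familiar with the point metrics, the sketch below shows the idea behind MASE (the mean absolute error of the forecast scaled by the in-sample error of a seasonal naive forecast); it is a simplified illustration with toy numbers, not the benchmark code itself.
```
import numpy as np

def mase(y_true, y_pred, y_train, seasonality):
    # forecast error on the evaluation window
    mae_forecast = np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
    # in-sample error of a seasonal naive forecast (lag = seasonality)
    y_train = np.asarray(y_train)
    mae_insample = np.mean(np.abs(y_train[seasonality:] - y_train[:-seasonality]))
    return mae_forecast / mae_insample

# toy hourly example (seasonality = 24)
train = np.sin(np.arange(24 * 7))
print(mase(np.sin(np.arange(168, 192)), np.zeros(24), train, seasonality=24))
```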
## Results
The results are summarized in the following table, highlighting the performance metrics of MASE, MAPE, CRPS, and TIME (measured in seconds). The best results are indicated in **bold** for easy reference.
<img width="953" alt="image" src="https://github.com/Nixtla/nixtla/assets/10517170/8e65338d-930e-4837-8bf5-2e7aeddad5cc">
## Reproducibility
To ensure the reproducibility of our findings, the experiments were conducted on an AWS g5.4xlarge GPU instance equipped with 16 vCPUs, 64 GiB of RAM, and an NVIDIA A10G Tensor Core GPU (24 GiB). The complete code can be found in this repo.
### Instructions
1. Create a Python environment using:
```
mamba env create -f environment.yml
conda activate lag-llama
```
2. Add lag-llama code to your environment
```
make download_lag_llama_code
```
3. Download the lag-llama model
```
make download_lag_llama_model
```
4. Install lag-llama requirements
```
pip install -r lag-llama-requirements.txt
```
5. Run complete experiments reported in the table
```
python -m src.main
```
### References
- **Lag-Llama Paper**: [Towards Foundation Models for Probabilistic Time Series Forecasting](https://arxiv.org/abs/2310.08278)
- **SeasonalNaive Implementation**: [GitHub Repository](https://github.com/nixtla/statsforecast/)
- **CRPS Replication Note**: The CRPS performance for `LagLlama` is replicated from the model's publicly available [Colab notebook](https://colab.research.google.com/drive/13HHKYL_HflHBKxDWycXgIUAHSeHRR5eo?usp=sharing), ensuring a fair comparison.
name: lag-llama
channels:
- conda-forge
- defaults
- anaconda
dependencies:
- jupyterlab
- pip
- python=3.10
- pip:
- datasetsforecast
- fire
- huggingface_hub[cli]
- neuralforecast
- orjson
- statsforecast
- utilsforecast