readme

f42429f6 · bailuo · f42429f6 · f42429f6 · f42429f6 · f42429f6
Commit f42429f6 authored Nov 19, 2025 by bailuo
20 changed files
--- a/experiments/azure-automl-forecasting/src/utils/data_handler.py
+++ b/experiments/azure-automl-forecasting/src/utils/data_handler.py
+import logging
+import warnings
+from dataclasses import dataclass, asdict
+from functools import partial
+from pathlib import Path
+
+import pandas as pd
+from utilsforecast.evaluation import evaluate
+from utilsforecast.losses import rmse, mae, mase
+
+from src.utils.filter_data import DatasetParams
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExperimentDataset:
+    Y_df_train: pd.DataFrame
+    Y_df_test: pd.DataFrame
+    horizon: int
+    seasonality: int
+    frequency: str
+    pandas_frequency: str
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame) -> "ExperimentDataset":
+        """
+        Parameters
+        ----------
+        df : pd.DataFrame
+            df should have columns: unique_id, ds, y, frequency, pandas_frequency, horizon, seasonality
+        """
+        ds_params = DatasetParams.from_df(df)
+        df = df[["unique_id", "ds", "y"]]  # type: ignore
+        Y_df_test = df.groupby("unique_id").tail(ds_params.horizon)
+        Y_df_train = df.drop(Y_df_test.index)  # type: ignore
+        return cls(
+            Y_df_train=Y_df_train,
+            Y_df_test=Y_df_test,
+            **asdict(ds_params),
+        )
+
+    @classmethod
+    def from_parquet(
+        cls,
+        parquet_path: str,
+    ) -> "ExperimentDataset":
+        df = pd.read_parquet(parquet_path)
+        return cls.from_df(df=df)
+
+    def evaluate_forecast_df(
+        self,
+        forecast_df: pd.DataFrame,
+        model: str,
+        total_time: float,
+    ) -> pd.DataFrame:
+        df_ = self.Y_df_test.copy(deep=True)
+        if forecast_df.dtypes["ds"] != df_.dtypes["ds"]:
+            df_["ds"] = df_["ds"].astype(forecast_df.dtypes["ds"])
+        df = df_.merge(
+            forecast_df[["unique_id", "ds", model]],
+            on=["unique_id", "ds"],
+            how="left",
+        )
+        if df[model].isna().sum() > 0:
+            na_uids = df.loc[df[model].isna()]["unique_id"].unique()
+            main_logger.warning(
+                f"{model} contains NaN for {len(na_uids)} series: {na_uids}"
+                "filling with last values"
+            )
+            from statsforecast import StatsForecast
+            from statsforecast.models import SeasonalNaive
+
+            sf = StatsForecast(
+                models=[SeasonalNaive(season_length=self.seasonality)],
+                freq=self.pandas_frequency,
+            )
+            sn_df = sf.forecast(
+                df=self.Y_df_train,
+                h=self.horizon,
+            )
+            df = df.merge(sn_df, on=["unique_id", "ds"], how="left")  # type: ignore
+            df.loc[df["unique_id"].isin(na_uids), model] = df.loc[
+                df["unique_id"].isin(na_uids), "SeasonalNaive"
+            ]
+            df = df.drop(columns=["SeasonalNaive"])
+        partial_mase = partial(mase, seasonality=self.seasonality)
+        eval_df = evaluate(
+            df=df,
+            metrics=[rmse, mae, partial_mase],
+            train_df=self.Y_df_train,
+            models=[model],
+        )
+        eval_df = eval_df.groupby("metric").mean(numeric_only=True).reset_index()  # type: ignore
+        eval_time_df = pd.DataFrame(
+            {
+                "metric": ["total_time"],
+                model: [total_time],
+            }
+        )
+        eval_df = pd.concat(
+            [eval_df, eval_time_df],
+            ignore_index=True,
+        )  # type: ignore
+        return eval_df.set_index("metric")
+
+
+@dataclass
+class ForecastDataset:
+    forecast_df: pd.DataFrame
+    total_time: float
+
+    @classmethod
+    def from_dir(cls, dir: str | Path):
+        dir_ = Path(dir)
+        forecast_df = pd.read_parquet(dir_ / "forecast_df.parquet")
+        with open(dir_ / "total_time.txt", "r") as file:
+            total_time = float(file.read())
+        return cls(forecast_df=forecast_df, total_time=total_time)
+
+    @staticmethod
+    def is_forecast_ready(dir: str | Path):
+        dir_ = Path(dir)
+        forecast_path = dir_ / "forecast_df.parquet"
+        time_path = dir_ / "total_time.txt"
+        return forecast_path.exists() and time_path.exists()
+
+    def save_to_dir(self, dir: str | Path):
+        dir_ = Path(dir)
+        dir_.mkdir(parents=True, exist_ok=True)
+        self.forecast_df.to_parquet(dir_ / "forecast_df.parquet")
+        with open(dir_ / "total_time.txt", "w") as file:
+            file.write(str(self.total_time))
--- a/experiments/azure-automl-forecasting/src/utils/download_data.py
+++ b/experiments/azure-automl-forecasting/src/utils/download_data.py
+import logging
+from concurrent.futures import ProcessPoolExecutor
+
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+def read_parquet_and_assign(uid, url):
+    df = pd.read_parquet(url)
+    df["unique_id"] = uid
+    df["ds"] = df["ds"].astype(str)
+    return df[["unique_id", "ds", "y"]]
+
+
+def download_data():
+    catalogue_splits = pd.read_parquet("./data/catalogue_splits.parquet")
+    catalogue_datasets = pd.read_parquet("./data/catalogue_datasets.parquet")
+    catalogue_df = catalogue_splits.merge(
+        catalogue_datasets,
+        on=["dataset", "subdataset", "frequency"],
+    )
+    del catalogue_splits
+    del catalogue_datasets
+    catalogue_df = catalogue_df.query("split == 'test'")[
+        [
+            "unique_id",
+            "frequency",
+            "url",
+            "pandas_frequency",
+            "seasonality",
+            "horizon",
+        ]
+    ]
+    grouped_df = catalogue_df.groupby(["frequency", "pandas_frequency"])
+    for (frequency, pandas_frequency), df in grouped_df:
+        uids, urls = df["unique_id"].values, df["url"].values
+        main_logger.info(
+            f"frequency: {frequency}, pandas_frequency: {pandas_frequency}"
+        )
+        n_uids = len(uids)
+        main_logger.info(f"number of uids: {n_uids}")
+        max_workers = min(10, n_uids)
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            futures = [
+                executor.submit(read_parquet_and_assign, uid, url)
+                for uid, url in zip(uids, urls)
+            ]
+            results = [future.result() for future in futures]
+        main_logger.info("dataset read")
+        Y_df = pd.concat(results)
+        Y_df = Y_df.merge(
+            df.drop(columns="url"),
+            on="unique_id",
+            how="left",
+        )
+        Y_df.to_parquet(f"./data/{frequency}_{pandas_frequency}.parquet")
+        del Y_df
+        main_logger.info("dataset saved")
+
+
+if __name__ == "__main__":
+    download_data()
--- a/experiments/azure-automl-forecasting/src/utils/filter_data.py
+++ b/experiments/azure-automl-forecasting/src/utils/filter_data.py
+"""
+this module takes Nixtla's benchmarking data 
+and filters it to prevent azureml from crashing
+in the following cases:
+- too short series, see https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-methods?view=azureml-api-2#data-length-requirements
+"""
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable
+
+import fire
+import numpy as np
+import pandas as pd
+
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatasetParams:
+    frequency: str
+    pandas_frequency: str
+    horizon: int
+    seasonality: int
+
+    @staticmethod
+    def _get_value_from_df_col(
+        df: pd.DataFrame,
+        col: str,
+        dtype: Callable | None = None,
+    ) -> Any:
+        col_values = df[col].unique()
+        if len(col_values) > 1:
+            raise ValueError(f"{col} is not unique: {col_values}")
+        value = col_values[0]
+        if dtype is not None:
+            value = dtype(value)
+        return value
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame) -> "DatasetParams":
+        dataset_params = {}
+        dataset_params_cols = [
+            "frequency",
+            "pandas_frequency",
+            "horizon",
+            "seasonality",
+        ]
+        dataset_params_cols_dtypes = [str, str, int, int]
+        for col, dtype in zip(dataset_params_cols, dataset_params_cols_dtypes):
+            dataset_params[col] = cls._get_value_from_df_col(df, col, dtype=dtype)
+        return cls(**dataset_params)
+
+
+def filter_and_clean_dataset(
+    dataset_path: str,
+    max_series: int = 1_000,
+    n_train_cv: int = 2,
+    n_seasonalities: int = 5,
+    max_insample_length: int = 3_000,
+    random_seed: int = 420,
+):
+    main_logger.info(f"Processing dataset {dataset_path}")
+    df = pd.read_parquet(dataset_path)
+    df = df.drop_duplicates(["unique_id", "ds"])  # type: ignore
+    df = df.sort_values(["unique_id", "ds"])
+    ds_params = DatasetParams.from_df(df)
+    min_train_size_per_series = (
+        ds_params.horizon
+        + 2 * ds_params.horizon
+        + (n_train_cv - 1) * ds_params.horizon
+        + 1
+    )
+    if ds_params.seasonality < 100:
+        # if series has low seasonality
+        # we add n_seasonalities to min_train_size_per_series
+        # to keep the series long enough
+        min_train_size_per_series += n_seasonalities * ds_params.seasonality
+    uids = df["unique_id"].unique()  # type: ignore
+    df = (
+        df.groupby("unique_id")
+        .filter(lambda x: len(x) >= min_train_size_per_series)
+        .groupby("unique_id")  # type: ignore
+        .tail(max_insample_length + ds_params.horizon)
+        .reset_index(drop=True)
+    )
+    main_logger.info(
+        f"Filtering out {len(uids) - len(df['unique_id'].unique())} series"
+    )
+    uids = df["unique_id"].unique()  # type: ignore
+    if len(uids) > max_series:
+        np.random.seed(random_seed)
+        uids = np.random.choice(uids, max_series, replace=False)  # type: ignore
+        df = df.query("unique_id in @uids")  # type: ignore
+        main_logger.info(f"Filtering out {len(uids) - max_series} series")
+    # finally we clean some strange dates
+    mask = df["ds"].str.endswith(":01")  # type: ignore
+    df.loc[mask, "ds"] = df.loc[mask, "ds"].str[:-3] + ":00"
+    # save the dataset
+    dataset_path = Path(dataset_path)  # type: ignore
+    filtered_dataset_path = dataset_path.parent / "filtered_datasets" / dataset_path.name  # type: ignore
+    filtered_dataset_path.parent.mkdir(exist_ok=True, parents=True)
+    df.to_parquet(filtered_dataset_path)
+    main_logger.info(f"Filtered dataset saved to {filtered_dataset_path}")
+
+
+if __name__ == "__main__":
+    fire.Fire(filter_and_clean_dataset)
--- a/experiments/efficiency/README.md
+++ b/experiments/efficiency/README.md
+# 🚀 TimeGPT API v2: Faster, Smarter, and More Powerful Time Series Forecasting! 🚀
+
+We’re excited to introduce **v2 of the TimeGPT API**, featuring a significant boost in performance, enhanced flexibility, and new capabilities that make time series forecasting faster and more insightful than ever before.
+
+In this release, you will find:
+- **Dramatic speed improvements** across all major endpoints 🏎️
+- **Scalable forecasting** that handles 1 billion time series in just 6 hours 📊
+- **Advanced handling of exogenous variables**, both historical and future 🌐
+- **Enhanced explainability** through SHAP values 🧠
+- **New integration with Polars**, a high-performance DataFrame library ⚡
+
+## Key Performance Highlights 🔥
+
+We've optimized the core functionalities—forecasting, anomaly detection, and cross-validation—with v2 showing significant speedups compared to v1. Below are the benchmark results:
+
+| Endpoint          | Features   | Level   | v1   | v2   | Speedup   |
+|:------------------|:-----------|:--------|:-----|:-----|:----------|
+| anomaly_detection | exog       | [80]    | 24s  | 3s   | 9x        |
+| anomaly_detection | none       | [80]    | 13s  | 2s   | 8x        |
+| cross_validation  | exog       | None    | 22s  | 4s   | 6x        |
+| cross_validation  | exog       | [80]    | 31s  | 6s   | 5x        |
+| cross_validation  | none       | None    | 5s   | 1s   | 9x        |
+| cross_validation  | none       | [80]    | 9s   | 2s   | 4x        |
+| forecast          | exog       | None    | 18s  | 1s   | 13x       |
+| forecast          | exog       | [80]    | 20s  | 2s   | 10x       |
+| forecast          | none       | None    | 1s   | 0s   | 6x        |
+| forecast          | none       | [80]    | 3s   | 1s   | 6x        |
+
+These results represent the huge leap in efficiency v2 provides, allowing you to analyze vast datasets and derive insights faster than ever before. 🚀
+
+## How to Reproduce Results
+
+### Installation 🛠️
+
+1. Install the required Python packages:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+### Running the Code 🏃‍♀️
+
+This script benchmarks **forecasting**, **anomaly detection**, and **cross-validation** across both v1 and v2 of the TimeGPT API. You can run the script and compare performance results by executing:
+
+```bash
+python main.py
+```
+
+## Performance Breakdown 🏎️
+
+With v2, you get **up to 13x speed improvements** on key operations like forecasting with exogenous variables. This makes the API ideal for production environments where performance and scalability are paramount.
+
+### New Features in v2
+
+- **Advanced Exogenous Variable Handling**: Leverage both historical and future exogenous data for more accurate forecasts.
+- **SHAP Values**: Improve model interpretability with SHAP value integration.
+- **Polars Integration**: Benefit from lightning-fast data processing with Polars, especially useful for big datasets. 
+
+## Conclusion 🚀
+
+With TimeGPT API v2, you’re not just getting a faster API—you’re gaining the tools to scale up your time series analysis effortlessly, with greater precision and deeper insights. Whether it’s detecting anomalies, validating models, or producing reliable forecasts, v2 ensures you get results **faster and smarter** than ever before.
+
+Happy forecasting! 
--- a/experiments/efficiency/main.py
+++ b/experiments/efficiency/main.py
+import itertools
+import logging
+import time
+
+import pandas as pd
+from nixtla import NixtlaClient as V2Client
+from nixtlats import NixtlaClient as V1Client
+from utilsforecast.data import generate_series
+from utilsforecast.feature_engineering import fourier
+
+logging.getLogger("nixtla").setLevel(logging.ERROR)
+logging.getLogger("nixtlats").setLevel(logging.ERROR)
+
+
+def forecast(client, df, X_df, h, level):
+    return client.forecast(df=df, X_df=X_df, h=h, level=level)
+
+
+def cross_validation(client, df, X_df, h, level):
+    return client.cross_validation(df=df, h=h, n_windows=4, level=level)
+
+
+def anomaly_detection(client, df, X_df, h, level):
+    if isinstance(level, list):
+        level = level[0]
+    return client.detect_anomalies(df=df, level=level)
+
+
+# Benchmark driver: times every endpoint on both client versions and writes
+# the comparison table to ``endpoint_times.md``.
+v1_client = V1Client()
+v2_client = V2Client()
+n_series = 1_000
+freq = "D"
+h = 14
+series = generate_series(n_series, freq=freq, min_length=200)
+# ``train``/``future`` are the inputs used for the "exog" scenario below.
+train, future = fourier(series, freq=freq, season_length=7, k=4, h=h)
+features = ["none", "exog"]
+level = [None, [80]]
+clients = {"v1": v1_client, "v2": v2_client}
+methods = {
+    "forecast": forecast,
+    "cross_validation": cross_validation,
+    "anomaly_detection": anomaly_detection,
+}
+# times[version]["<endpoint>-<features>-<level>"] -> elapsed seconds
+times = {version: {} for version in ("v1", "v2")}
+for feats, lvl in itertools.product(features, level):
+    if feats == "none":
+        df = series
+        X_df = None
+    else:
+        df = train
+        X_df = future
+    for name, method in methods.items():
+        # Anomaly detection is only benchmarked when a level is given.
+        if name == "anomaly_detection" and lvl is None:
+            continue
+        for version, client in clients.items():
+            # The timer starts before the label is built and printed, so that
+            # (small) overhead is part of the measured time.
+            start = time.perf_counter()
+            combination = f"{version} {name}. Features: {feats}. Level: {lvl}"
+            print(f"Running {combination}")
+            res = method(client, df=df, X_df=X_df, h=h, level=lvl)
+            time_taken = time.perf_counter() - start
+            times[version][f"{name}-{feats}-{lvl}"] = time_taken
+            print(f"{combination} took {time_taken:.1f} seconds.")
+
+# Note: ``df`` is rebound here from the input data to the results table.
+df = pd.DataFrame(times)
+# Split the "<endpoint>-<features>-<level>" keys into a 3-level index.
+df.index = df.index.str.split("-", expand=True)
+df.index.names = ["endpoint", "features", "level"]
+df = df.sort_index()
+# The speedup is computed on the raw seconds, before they are formatted.
+df["speedup"] = df["v1"] / df["v2"]
+df["speedup"] = df["speedup"].map("{:.0f}x".format)
+for col in ("v1", "v2"):
+    df[col] = df[col].map("{:.0f}s".format)
+with open("endpoint_times.md", "wt") as f:
+    f.write(df.reset_index().to_markdown(index=False))
--- a/experiments/efficiency/requirements.txt
+++ b/experiments/efficiency/requirements.txt
+nixtla>=0.6
+nixtlats==0.5.2
+pandas
--- a/experiments/foundation-time-series-arena/.env.example
+++ b/experiments/foundation-time-series-arena/.env.example
+NIXTLA_API_KEY=
+NIXTLA_BASE_URL=
--- a/experiments/foundation-time-series-arena/Makefile
+++ b/experiments/foundation-time-series-arena/Makefile
+# Clone lag-llama into a temporary directory, copy the data/, gluon_utils/
+# and lag_llama/ directories into the current directory, then remove the clone.
+# Sources are given without a trailing slash: BSD/macOS cp copies the
+# *contents* of "dir/" instead of the directory itself.
+.PHONY: download_lag_llama_code
+download_lag_llama_code:
+	@git clone https://github.com/time-series-foundation-models/lag-llama tempdir
+	@cp -R tempdir/data .
+	@cp -R tempdir/gluon_utils .
+	@cp -R tempdir/lag_llama .
+	@rm -rf tempdir
+
+# Sync the benchmark data from the S3 bucket into
+# nixtla-foundational-time-series/data; --no-sign-request sends unsigned
+# requests, so no AWS credentials are used.
+download_data:
+	@aws s3 sync s3://nixtla-foundational-time-series/data nixtla-foundational-time-series/data --no-sign-request
--- a/experiments/foundation-time-series-arena/README.md
+++ b/experiments/foundation-time-series-arena/README.md
+# Benchmarking foundation models for time series
+
+> TL;DR: Foundation models for time series outperform alternatives and are ready to be tested in production. TimeGPT-1 is (so far) the most accurate and fastest model but TimesFM from Google comes very close. Some models are still outperformed by classical alternatives.
+
+Notes: 
+* The Amazon team responded to the original benchmark with this [PR](https://github.com/Nixtla/nixtla/pull/382) that shows, according to them, that by changing some parameters, Chronos is significantly faster and more accurate.
+* The Salesforce team also responded with this [PR](https://github.com/Nixtla/nixtla/pull/389) showing improved accuracy and performance.
+We are currently reviewing both PRs.
+
+# Introduction
+
+We present a reproducible benchmark comparing foundation models against a wide variety of other models on a large-scale dataset.  
+
+We conclude that [TimeGPT-1](https://arxiv.org/abs/2310.03589b) ranks first in terms of accuracy and inference speed compared to the latest foundation models, including [TimesFM](https://arxiv.org/pdf/2310.10688) (Google), [Chronos](https://arxiv.org/abs/2403.07815) (Amazon), [Moirai](https://arxiv.org/abs/2402.02592) (Salesforce), and [Lag-Llama](https://arxiv.org/pdf/2310.08278) (ServiceNow). `TimeGPT-1` and `TimesFM` also outperform established statistical, machine learning, and deep-learning models, with inference times comparable to those of a `SeasonalNaive`. `Chronos`, `Moirai` and `Lag-Llama` still need further improvement and can be outperformed by classical methods.
+
+This analysis spans over **30,000 unique time series** across various domains and frequencies from M-Competitions, Monash Repository, and Wikipedia page views, among others, robustly comparing these models.
+
+# Zero-shot foundation models
+The rise of zero-shot foundation models in time series forecasting, such as `TimeGPT-1` by [Nixtla](https://github.com/Nixtla/), `TimesFM` by Google or `Chronos` by Amazon, represents a significant leap forward in our field. The promise of this innovation is to allow practitioners to forecast accurately without having to train their own models. If foundation models succeed, this would make forecasting and anomaly detection much easier, faster, and, in many cases, more accurate than state-of-the-art alternatives. 
+
+We have also seen some of these models being offered as out-of-the-box solutions. We at Nixtla [recently announced](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/announcing-timegen-1-in-azure-ai-leap-forward-in-time-series/ba-p/4140446) that `TimeGPT-1` and `TimeGEN-1` are now available on both Azure and our own platform. Google will also release a version of `TimesFM` on VertexAI, and it wouldn't be surprising if Amazon is trying to do the same for Bedrock.
+
+We at Nixtla have provided some [early success stories](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/announcing-timegen-1-in-azure-ai-leap-forward-in-time-series/ba-p/4140446) of real companies leveraging the simplicity and accuracy of `TimeGPT-1` and we are sure that more positive examples will follow from other models.
+
+However, the field [is still divided](https://news.ycombinator.com/item?id=39235983) on how all the different foundation models compare against each other. In the spirit of collaboration, we are starting a new project, `xiuhmolpilli`, in honor of how ancient civilizations celebrated the end of cycles, to build a benchmark to compare all the different foundation models for time series data in a large scale dataset and against classical, ML and Deep Learning Models.
+
+
+
+# Empirical Evaluation
+
+This study considers **over 30,000 unique time series** from the Monash Repository, M-Competitions, Wikipedia page views, among others, spanning various time series frequencies: Monthly, Weekly, Daily, and Hourly. Our evaluation compares five foundation models for time series data in terms of accuracy and inference times. We have also included comparisons to a large battery of statistical, machine learning, and deep-learning models, to provide a benchmark against traditional forecasting methods.
+
+We include the following models in our comprehensive evaluation:
+
+- [Statistical](https://github.com/Nixtla/statsforecast/): `SeasonalNaive`, `HistoricAverage`, `ZeroModel`, `AutoARIMA`, `Prophet`, `AutoCES`, `AutoETS`, `Theta`, `DynamicOptimizedTheta`, `ADIDA`, `IMAPA`, and `CrostonClassic`.
+- [Machine Learning](https://github.com/Nixtla/mlforecast/): `AutoLGBM`. 
+- [Deep Learning](https://github.com/Nixtla/neuralforecast/): `AutoTFT`, `AutoNHITS`.
+- Foundation: `Chronos`, `Lag-Llama`, `Moirai`, `TimeGPT`, `TimeGPT` (long horizon), and `TimesFM`. 
+
+## Results
+
+`TimeGPT-1` ranks first in terms of accuracy and inference speed compared to the latest foundation models, including `TimesFM`, `Chronos`, `Moirai`, and `Lag-Llama`. `TimesFM` by Google ranks second in accuracy and outperforms `TimeGPT-1` in inference speed. Amazon `Chronos` ranks third in accuracy but shows a significant drop in inference speed. Both Salesforce's and ServiceNow's models are far more efficient in terms of inference speed than `Chronos`, but they rank lower in terms of accuracy.
+
+The following image shows the average performance ranking based on `MASE` for the four frequencies:
+
+![image](https://github.com/Nixtla/nixtla/assets/10517170/ed9b66f9-afd5-49c6-b736-48cb9b239de8)
+
+
+Our findings are shown in the following table, showcasing the performance measured by `MASE` and computational inference time (in minutes). The best model is highlighted in **bold** and the second best is underlined for ease of reference.
+
+![image](https://github.com/Nixtla/nixtla/assets/10517170/1c042591-0585-4a5b-a548-2017a28f2d4f)
+
+
+We also present a plot comparing accuracy and speed across foundation models.
+
+![image](https://github.com/Nixtla/nixtla/assets/10517170/2a6a630e-c9db-4530-8ef2-86db3c85d8a9)
+
+
+Some noteworthy observations from the results include:
+- `TimeGPT-1` consistently achieves the best overall ranking among all models, with comparable inference times to the simplest baselines and `TimesFM`.
+- `TimesFM` ranks second among all foundation models, with performance slightly worse than `TimeGPT-1`.
+- `Chronos` ranks third in performance, but with extremely high inference times, reducing its utility as a pre-trained foundation model. For reference, it is possible to fully train state-of-the-art deep-learning models, including automatic hyperparameter selection, in less time than `Chronos` takes for zero-shot inference.
+- `Moirai` and `Lag-Llama` rank last among foundation models and are often outperformed by almost all statistical, ML, and deep learning models.
+- While `Prophet` is still widely used by practitioners, it consistently ranks lower than all the methods considered.
+
+
+## Challenges to benchmark foundation models
+
+There are two main challenges we faced to correctly compare foundation models:
+
+- creating a brand-new framework capable of running various methods and algorithms. The framework presented here, xiuhmolpilli, was designed as an abstraction of the different ways foundation models were developed, including classic statistical, machine learning, and deep learning models. To that end, we based our architecture on the nixtlaverse approach to forecasting.
+- finding appropriate and novel data unseen by all models considered in the analysis. For the current results, we guaranteed that all the timestamps for all the time series were completely unseen to TimeGPT-1 during training, including the train set part of the time series commonly used for in-distribution evaluation. 
+
+Based on the datasets used for training reported in the papers of other foundation models:
+- We can't discard that `TimesFM` potentially observed during training the time series made available here from Monthly, Weekly, and Daily frequencies.
+- `Chronos` could have observed the train part of a small fraction of the time series (as reported in `Table 2 in-domain evaluation`) for all frequencies. 
+- Based on the data contained in `LOTSA`, `Moirai` could have observed the train part of a small fraction of the time series for Monthly, Weekly, and Daily frequencies.
+- `Lag-Llama` could have observed a small fraction of the time series for Monthly, Weekly, and Daily frequencies.
+
+Note: we updated the initial benchmark table to reduce the chance of leakage.
+
+Given these observations, we expect the current evaluation setting to at least favour other foundation models, as having observed the time series during training will likely lead to increased performance. 
+
+This also underscores a common concern among practitioners: for models like `TimeGPT-1`, where the exact training data remains undisclosed, evaluating the model using public datasets is problematic due to the potential for overfitting. We recognize this as a limitation of closed-source models; however, we appeal for the reader's understanding. As a small startup competing with the largest companies in the world, we have had to keep some of our methods confidential.
+
+That being said we celebrate the open-source nature of the other foundation models and we encourage the community to continue to push for more transparency in the field. We have been doing our part and will continue to do so in due time. 
+
+We would also like to conclude that it is essential for practitioners to test these models on their datasets and form their conclusions.
+
+
+## Preliminary Conclusions
+
+In addition to the success stories we have heard from our users, this benchmark indicates that other models are also ready to be used in production for time series forecasting and anomaly detection tasks. By no means are we claiming that `TimeGPT-1` is the best model for all tasks, but it is the most accurate and fastest model we have benchmarked so far. It is also production-ready and can be used and tested by anyone out of the box.
+
+That being said, it is well known among practitioners that before deploying any model into production, you should do some benchmarking. It is also known that there is no such thing as magic in time-series. And finally, it is also known that `Prophet` is not a good benchmarking model.
+
+## Special Conclusion for Hacker News readers
+
+* You can't forecast stocks with ANY of these models if you don't have additional data to include as covariates. Please stop [asking](https://news.ycombinator.com/item?id=37874891).
+
+# Acknowledgments
+
+We at Nixtla believe that all of our work is due to other great researchers that have paved the way. In that spirit, we would like to thank the more than 50 researchers from Google Cloud, AWS AI, AWS, UC Berkeley, NYU, CMU, Salesforce Research, Salesforce, McGill University, ServiceNow, Quebec AI Institute, Morgan Stanley, CERC AAI lab, MILA, Zalando who have been ardently working to improve the 
+field of foundation models in time series.
+
+
+# ToDo's [WIP]
+
+> This is a first attempt to provide a fully reproducible benchmark foundation. A lot of work still needs to be done:
+* Include probabilistic benchmarks
+* Explore the distribution of errors across MASE and other metrics
+* Experiment with different cross validation windows
+* Include more classical and deep learning models
+* Include more foundation models (Moment (CMU) and Tiny Time Mixers(IBM))
+* Benchmark anomaly detection
+* Include finetuning
+
+# Reproducibility
+
+To ensure the reproducibility of our findings, the experiments for non-GPU workloads were conducted on an AWS c5a.24xlarge instance, equipped with 96 vCPUs and 192 GiB of RAM. In contrast, the experiments for foundation and deep learning models were carried out on an AWS g5.4xlarge GPU instance, which includes 16 vCPUs, 64 GiB of RAM, and an NVIDIA A10G Tensor Core GPU with 24 GiB. All necessary code and detailed instructions for reproducing the experiments are available in this directory.
+
+## Get started with Nixtla's models on the public API
+
+Note that to use `TimeGPT-1` you will need an account on the Nixtla platform, which gives you access to our models and a [30-day free trial](https://dashboard.nixtla.io/freetrial). You can also access `TimeGPT-1` through [Azure AI Studio](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-timegen-1?view=azureml-api-2) and Azure Machine Learning (named `TimeGEN-1`).
+
+## Instructions to reproduce the benchmark
+
+### Download data
+
+#### Configure aws cli
+
+The data lives in s3. You can download it using the `aws` cli. Follow the instructions [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) to install it.
+
+#### Download data from s3
+
+```
+make download_data
+```
+
+### Download extra code
+
+```
+make download_lag_llama_code
+```
+
+### Download data
+
+```
+make download_data
+```
+
+### Create environment
+
+```
+mamba create -n foundation-ts python=3.10
+conda activate foundation-ts
+pip install uv
+uv pip install -r requirements.txt
+```
+
+### Run
+
+```
+python -m xiuhmolpilli.arena
+```
+
+### New Features
+
+Since the release, we have received more than 15,000 requests from companies, organizations, and researchers eager to try `TimeGPT-1`. We are thrilled with the overall feedback, as `TimeGPT-1` has proven valuable for hundreds of applications, from predicting river flows to forecasting sales demand for thousands of products.
+
+After several months of beta testing, we have new updates and improvements based on user feedback:
+
+- New Features: Introduction of `model="timegpt-1-long-horizon"` for improved long-term forecast accuracy. Enhanced uncertainty quantification for forecasting and anomaly detection. Advanced model fine-tuning with diverse loss functions (e.g., `"mae"`, `"mse"`, `"rmse"`, `"mape"`, `"smape"`). Support for distributed computing and big data (Spark, Ray, and Dask).
+- [Documentation Improvements](https://docs.nixtla.io/): Revamped layout, new tutorials, and Google Colab compatibility. Expanded documentation covers What-if scenarios, electricity and financial forecasting, and anomaly detection.
+- [New R SDK](https://github.com/Nixtla/nixtlar). Forecast using `TimeGPT-1` with R.
+
+### Partnering with Microsoft to provide our models on Azure
+
+At Nixtla, our mission is to make frontier AI ubiquitous. Last week we announced the availability of `TimeGEN-1` on Azure. Our models are now accessible through:
+
+- Public API: hosted safely on Nixtla infrastructure, this access point enables developers to create applications and services across our range of models. Create your account [here](https://dashboard.nixtla.io/)
+
+- [Azure](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-timegen-1?view=azureml-api-2): `TimeGEN-1` is available through Azure AI Studio and Azure Machine Learning, offering a seamless user experience comparable to our API. Beta customers have experienced significant success. 
+
+- Self-deployment: our models can also be deployed in your environment for the most sensitive use cases. Contact our team for further details.
+
+
+# References
+- **TimeGPT-1**: [TimeGPT-1](https://arxiv.org/abs/2310.03589)
+- **Chronos**: [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815)
+- **TimesFM**: [A decoder-only foundation model for time-series forecasting](https://arxiv.org/pdf/2310.10688)
+- **Moirai**: [Unified Training of Universal Time Series Forecasting Transformers](https://arxiv.org/abs/2402.02592)
+- **Lag-Llama**: [Lag-Llama: Towards Foundation Models for Probabilistic Time Series Forecasting](https://arxiv.org/pdf/2310.08278)
--- a/experiments/foundation-time-series-arena/requirements.txt
+++ b/experiments/foundation-time-series-arena/requirements.txt
+gluonts[torch]
+numpy
+torch>=2.0.0
+wandb
+scipy
+pandas
+huggingface_hub[cli]
+einshape
+fire
+nixtla
+python-dotenv
+rich
+statsforecast
+neuralforecast
+utilsforecast
+mlforecast
+lightgbm
+chronos @ git+https://github.com/amazon-science/chronos-forecasting.git
+salesforce-uni2ts @ git+https://github.com/SalesforceAIResearch/uni2ts.git
+timesfm @ git+https://github.com/AzulGarza/timesfm.git@fix-structure
+jax[cuda12]
+pytest
+prophet
--- a/experiments/foundation-time-series-arena/tests/__init__.py
+++ b/experiments/foundation-time-series-arena/tests/__init__.py
--- a/experiments/foundation-time-series-arena/tests/test_arena.py
+++ b/experiments/foundation-time-series-arena/tests/test_arena.py
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pandas as pd
+
+from xiuhmolpilli.arena import FoundationalTimeSeriesArena
+from .utils import models
+from .test_eval import generate_exp_dataset
+
+
+def generate_data(freq: str, tmpdir: str) -> str:
+    df = generate_exp_dataset(n_series=5, freq=freq, return_df=True)
+    df_parquet_path = Path(tmpdir) / f"dataset_{freq}.parquet"
+    df.to_parquet(df_parquet_path)
+    return str(df_parquet_path)
+
+
+def test_foundational_time_series_arena():
+    cwd = Path.cwd()
+    with TemporaryDirectory(dir=cwd) as tmpdir:
+        parquet_data_paths = [generate_data(freq, tmpdir) for freq in ["H", "MS"]]
+        arena = FoundationalTimeSeriesArena(
+            models=models,
+            parquet_data_paths=parquet_data_paths,
+            results_dir=tmpdir,
+        )
+        arena.compete()
+        eval_df = pd.read_csv(arena.evaluation_path)
+        arena.compete()
+        eval_df_2 = pd.read_csv(arena.evaluation_path)
+        print(eval_df)
+        print(eval_df_2)
+        assert eval_df.equals(eval_df_2)
+        print(eval_df)
--- a/experiments/foundation-time-series-arena/tests/test_eval.py
+++ b/experiments/foundation-time-series-arena/tests/test_eval.py
--- a/experiments/foundation-time-series-arena/tests/test_models.py
+++ b/experiments/foundation-time-series-arena/tests/test_models.py
+import pandas as pd
+import pytest
+from utilsforecast.data import generate_series
+
+from .utils import models
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("h", [1, 12])
+def test_correct_forecast_dates(model, freq, h):
+    n_series = 5
+    df = generate_series(
+        n_series,
+        freq=freq,
+    )
+    df["unique_id"] = df["unique_id"].astype(str)
+    df_test = df.groupby("unique_id").tail(h)
+    df_train = df.drop(df_test.index)
+    fcst_df = model.forecast(
+        df_train,
+        h=h,
+        freq=freq,
+    )
+    exp_n_cols = 3
+    assert fcst_df.shape == (n_series * h, exp_n_cols)
+    exp_cols = ["unique_id", "ds"]
+    pd.testing.assert_frame_equal(
+        fcst_df[exp_cols].sort_values(["unique_id", "ds"]).reset_index(drop=True),
+        df_test[exp_cols].sort_values(["unique_id", "ds"]).reset_index(drop=True),
+    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("freq", ["H", "D", "W-MON", "MS"])
+@pytest.mark.parametrize("n_windows", [1, 4])
+def test_cross_validation(model, freq, n_windows):
+    h = 12
+    n_series = 5
+    df = generate_series(n_series, freq=freq, equal_ends=True)
+    df["unique_id"] = df["unique_id"].astype(str)
+    cv_df = model.cross_validation(
+        df,
+        h=h,
+        freq=freq,
+        n_windows=n_windows,
+    )
+    exp_n_cols = 5  # unique_id, cutoff, ds, y, model
+    assert cv_df.shape == (n_series * h * n_windows, exp_n_cols)
+    cutoffs = cv_df["cutoff"].unique()
+    assert len(cutoffs) == n_windows
+    df_test = df.groupby("unique_id").tail(h * n_windows)
+    exp_cols = ["unique_id", "ds", "y"]
+    pd.testing.assert_frame_equal(
+        cv_df.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+        df_test.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+    )
+    if n_windows == 1:
+        # test same results using predict with less data
+        df_test = df.groupby("unique_id").tail(h)
+        df_train = df.drop(df_test.index)
+        fcst_df = model.forecast(
+            df_train,
+            h=h,
+            freq=freq,
+        )
+        exp_cols = ["unique_id", "ds"]
+        pd.testing.assert_frame_equal(
+            cv_df.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+            fcst_df.sort_values(["unique_id", "ds"]).reset_index(drop=True)[exp_cols],
+        )
--- a/experiments/foundation-time-series-arena/tests/utils.py
+++ b/experiments/foundation-time-series-arena/tests/utils.py
+from xiuhmolpilli.models.benchmarks import (
+    AutoARIMA,
+    NixtlaProphet,
+    SeasonalNaive,
+    AutoNHITS,
+    AutoTFT,
+    AutoLGBM,
+)
+from xiuhmolpilli.models.foundational import Chronos, LagLlama, Moirai, TimeGPT, TimesFM
+
+# Model instances shared by the test modules in this package (they import it
+# with `from .utils import models`). Every entry is constructed when this
+# module is imported.
+models = [
+    # benchmarks
+    AutoARIMA(),
+    NixtlaProphet(),
+    SeasonalNaive(),
+    # neural benchmarks
+    AutoNHITS(),
+    AutoTFT(),
+    # ml
+    AutoLGBM(),
+    # foundational models
+    Chronos("amazon/chronos-t5-tiny"),
+    LagLlama(),
+    Moirai("Salesforce/moirai-1.0-R-small"),
+    TimeGPT(),
+    TimesFM(),
+]
--- a/experiments/foundation-time-series-arena/xiuhmolpilli/__init__.py
+++ b/experiments/foundation-time-series-arena/xiuhmolpilli/__init__.py
--- a/experiments/foundation-time-series-arena/xiuhmolpilli/arena.py
+++ b/experiments/foundation-time-series-arena/xiuhmolpilli/arena.py
--- a/experiments/foundation-time-series-arena/xiuhmolpilli/models/__init__.py
+++ b/experiments/foundation-time-series-arena/xiuhmolpilli/models/__init__.py
--- a/experiments/foundation-time-series-arena/xiuhmolpilli/models/benchmarks/__init__.py
+++ b/experiments/foundation-time-series-arena/xiuhmolpilli/models/benchmarks/__init__.py
+from .ml import AutoLGBM
+from .neural import (
+    AutoNHITS,
+    AutoTFT,
+)
+from .prophet import NixtlaProphet
+from .stats import (
+    ADIDA,
+    AutoARIMA,
+    AutoCES,
+    AutoETS,
+    CrostonClassic,
+    DOTheta,
+    HistoricAverage,
+    IMAPA,
+    SeasonalNaive,
+    Theta,
+    ZeroModel,
+)
+
+# Public names of this package: the classes imported above from the `ml`,
+# `neural`, `prophet` and `stats` submodules.
+__all__ = [
+    "AutoLGBM",
+    "NixtlaProphet",
+    "AutoNHITS",
+    "AutoTFT",
+    "ADIDA",
+    "AutoARIMA",
+    "AutoCES",
+    "AutoETS",
+    "CrostonClassic",
+    "DOTheta",
+    "HistoricAverage",
+    "IMAPA",
+    "SeasonalNaive",
+    "Theta",
+    "ZeroModel",
+]
--- a/experiments/foundation-time-series-arena/xiuhmolpilli/models/benchmarks/ml.py
+++ b/experiments/foundation-time-series-arena/xiuhmolpilli/models/benchmarks/ml.py