readme

f42429f6 · bailuo · f42429f6 · f42429f6 · f42429f6 · f42429f6
Commit f42429f6 authored Nov 19, 2025 by bailuo
20 changed files
--- a/docs/mintlify/mint.json
+++ b/docs/mintlify/mint.json
+{
+  "$schema": "https://mintlify.com/schema.json",
+  "name": "Nixtla",
+  "logo": {
+    "light": "/light.png",
+    "dark": "/dark.png"
+  },
+  "favicon": "/favicon.svg",
+  "colors": {
+    "primary": "#0E0E0E",
+    "light": "#FAFAFA",
+    "dark": "#0E0E0E",
+    "anchors": {
+      "from": "#2AD0CA",
+      "to": "#0E00F8"
+    }
+  },
+  "topbarCtaButton": {
+    "type": "github",
+    "url": "https://github.com/Nixtla/nixtla"
+  },
+  "navigation": [
+    {
+      "group": "Getting Started",
+      "pages": [
+        "docs/getting-started/1_introduction",
+        "docs/getting-started/2_quickstart",
+        "docs/getting-started/21_polars_quickstart",
+        "docs/getting-started/22_azure_quickstart",
+        "docs/getting-started/3_setting_up_your_api_key",
+        "docs/getting-started/4_data_requirements",
+        "docs/getting-started/41_pricing",
+        "docs/getting-started/5_faq",
+        "docs/getting-started/6_glossary",
+        "docs/getting-started/7_why_timegpt"
+      ]
+    },
+    {
+      "group": "Capabilities",
+      "pages": [
+        {
+          "group": "Forecast",
+          "pages": [
+            "docs/capabilities/forecast/01_quickstart",
+            "docs/capabilities/forecast/02_exogenous_variables",
+            "docs/capabilities/forecast/03_holidays_special_dates",
+            "docs/capabilities/forecast/04_categorical_variables",
+            "docs/capabilities/forecast/05_longhorizon",
+            "docs/capabilities/forecast/06_multiple_series",
+            "docs/capabilities/forecast/07_finetuning",
+            "docs/capabilities/forecast/08_custom_loss_function",
+            "docs/capabilities/forecast/09_cross_validation",
+            "docs/capabilities/forecast/10_prediction_intervals",
+            "docs/capabilities/forecast/11_irregular_timestamps"
+          ]
+        },
+        {
+          "group": "Historical Anomaly Detection",
+          "pages": [
+            "docs/capabilities/historical-anomaly-detection/01_quickstart",
+            "docs/capabilities/historical-anomaly-detection/02_anomaly_exogenous",
+            "docs/capabilities/historical-anomaly-detection/03_anomaly_detection_date_features",
+            "docs/capabilities/historical-anomaly-detection/04_confidence_levels"
+          ]
+        },
+        {
+          "group": "Online Anomaly Detection",
+          "pages": [
+            "docs/capabilities/online-anomaly-detection/01_quickstart",
+            "docs/capabilities/online-anomaly-detection/02_adjusting_detection_process",
+            "docs/capabilities/online-anomaly-detection/03_univariate_vs_multivariate_anomaly_detection"
+          ]
+        }
+      ]
+    },
+    {
+      "group": "Deployment",
+      "pages": [
+        "docs/deployment/2_azure_ai"
+      ]
+    },
+    {
+      "group": "Tutorials",
+      "pages": [
+        "docs/tutorials/20_anomaly_detection",
+        {
+          "group": "Exogenous variables",
+          "pages": [
+            "docs/tutorials/01_exogenous_variables",
+            "docs/tutorials/02_holidays",
+            "docs/tutorials/03_categorical_variables",
+            "docs/tutorials/21_shap_values"
+          ]
+        },
+        {
+          "group": "Training",
+          "pages": [
+            "docs/tutorials/04_longhorizon",
+            "docs/tutorials/05_multiple_series"
+          ]
+        },
+        {
+          "group": "Fine-tuning",
+          "pages": [
+            "docs/tutorials/06_finetuning",
+            "docs/tutorials/061_reusing_finetuned_models",
+            "docs/tutorials/07_loss_function_finetuning",
+            "docs/tutorials/23_finetune_depth_finetuning"
+          ]
+        },
+        {
+          "group": "Validation",
+          "pages": [
+            "docs/tutorials/08_cross_validation",
+            "docs/tutorials/09_historical_forecast"
+          ]
+        },
+        {
+          "group": "Uncertainty quantification",
+          "pages": [
+            "docs/tutorials/10_uncertainty_quantification_with_quantile_forecasts",
+            "docs/tutorials/11_uncertainty_quantification_with_prediction_intervals"
+          ]
+        },
+        {
+          "group": "Special Topics",
+          "pages": [
+            "docs/tutorials/13_bounded_forecasts",
+            "docs/tutorials/14_hierarchical_forecasting",
+            "docs/tutorials/23_temporalhierarchical",
+            "docs/tutorials/15_missing_values",
+            "docs/tutorials/22_how_to_improve_forecast_accuracy"
+          ]
+        },
+        {
+          "group": "Computing at scale",
+          "pages": [
+            "docs/tutorials/16_computing_at_scale",
+            "docs/tutorials/17_computing_at_scale_spark_distributed",
+            "docs/tutorials/18_computing_at_scale_dask_distributed",
+            "docs/tutorials/19_computing_at_scale_ray_distributed"
+          ]
+        }
+      ]
+    },
+    {
+      "group": "Use cases",
+      "pages": [
+        "docs/use-cases/1_forecasting_web_traffic",
+        "docs/use-cases/2_bitcoin_price_prediction",
+        "docs/use-cases/3_electricity_demand",
+        "docs/use-cases/4_intermittent_demand",
+        "docs/use-cases/5_what_if_pricing_scenarios_in_retail"
+      ]
+    },
+    {
+      "group": "API Reference",
+      "pages": [
+        "nixtla_client",
+        "date_features",
+        "docs/reference/03_excel_addin",
+        "docs/reference/04_nixtlar"
+      ]
+    }
+  ]
+}
\ No newline at end of file
--- a/docs/to_mdx.py
+++ b/docs/to_mdx.py
+import re
+from pathlib import Path
+
+comment_pat = re.compile(r"<!--.*?-->", re.DOTALL)
+anchor_pat = re.compile(r"<a.*?>(.*?)</a>")
+output_path = Path("docs/mintlify")
+
+# process docs
+for file in Path("docs").glob("*.md"):
+    text = file.read_text()
+    text = comment_pat.sub("", text)
+    text = anchor_pat.sub("", text)
+    module_name = file.name.split(".")[-2]
+    output_file = output_path / (module_name + ".mdx")
+    output_file.write_text(text)
+
+
+readme_text = Path("README.md").read_text()
+readme_text = readme_text
+(output_path / "index.mdx").write_text(readme_text)
--- a/experiments/amazon-chronos/README.md
+++ b/experiments/amazon-chronos/README.md
+# Amazon Chronos is 10% less accurate and 500% slower than training classical statistical models.
+
+We present a fully reproducible comprehensive evaluation showcasing that a Statistical Ensemble, consisting of AutoARIMA, AutoETS, AutoCES, and DynamicOptimizedTheta, outperforms Amazon Chronos—a foundational model for time series forecasting with over 710 million parameters. Specifically, the **Statistical Ensemble demonstrates 10%, 10%, and 11% superior performance in CRPS, MASE, and SMAPE metrics, respectively**, and it is **5x faster**. This analysis spans over **50,000 unique time series** across M1, M3, M4, and Tourism datasets, robustly comparing these models.
+
+# Introduction
+
+The rise of foundational models in time series forecasting, such as Amazon Chronos, represents a significant leap forward, leveraging deep learning and massive datasets for model pre-training to enhance predictive accuracy. Amazon Chronos, in particular, is noteworthy for its extensive parameterization and ambitious scope. However, our study shows that a comparatively simpler approach, employing a Statistical Ensemble of traditional forecasting methods, yields better accuracy and computational efficiency. One year ago, we used the same [benchmark](https://github.com/Nixtla/statsforecast/tree/main/experiments/m3) to showcase how statistical models outperformed deep learning models. 
+
+## Empirical Evaluation
+
+This study considers over 50,000 unique time series from the M1, M3, M4, and Tourism datasets, spanning various time series frequencies. Chronos did not use these datasets in the training phase. We have also included comparisons to the Seasonal Naive model to provide a benchmark for traditional forecasting methods.
+
+## Results
+
+Our findings are shown in the following table, showcasing the performance across different metrics: CRPS, MASE, SMAPE, and computational time (in seconds). The best results are highlighted in **bold** for ease of reference.
+
+<img width="1099" alt="image" src="https://github.com/Nixtla/nixtla/assets/10517170/4d4fe9f3-4251-4b95-bd9b-248fc283e97b">
+
+
+## Reproducibility
+
+To ensure the reproducibility of our findings, the Statistical Ensemble experiments were conducted on an AWS c5a.24xlarge instance, equipped with 96 vCPUs and 192 GiB of RAM. In contrast, the experiments for Amazon Chronos were carried out on an AWS g5.4xlarge GPU instance, which includes 16 vCPUs, 64 GiB of RAM, and an NVIDIA A10G Tensor Core GPU with 24 GiB of GPU memory. All necessary code and detailed instructions for reproducing the experiments are available in this directory.
+
+### Instructions
+
+1. Set up a Python environment:
+   
+```bash
+mamba env create -f environment.yml
+conda activate amazon-chronos
+```
+
+2. Run the experiments as reported in the table:
+   
+```bash
+python -m src.main --mode fcst_statsforecast
+python -m src.main --mode fcst_chronos
+```
+
+3. Evaluate the results using:
+
+```bash
+python -m src.main --mode evaluation
+```
+
+### References
+- **Statistical Ensemble Paper**: [A Simple Combination of Univariate Models](https://www.sciencedirect.com/science/article/abs/pii/S0169207019300585?via%3Dihub)
+- **Amazon Chronos Paper**: [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815)
--- a/experiments/amazon-chronos/environment.yml
+++ b/experiments/amazon-chronos/environment.yml
+name: amazon-chronos
+channels:
+  - conda-forge
+  - defaults
+  - anaconda
+dependencies:
+  - jupyterlab
+  - pip
+  - python=3.10
+  - pip:
+    - datasetsforecast
+    - fire
+    - gluonts
+    - huggingface_hub[cli]
+    - neuralforecast
+    - orjson
+    - statsforecast
+    - utilsforecast
+    - git+https://github.com/amazon-science/chronos-forecasting.git
+
--- a/experiments/amazon-chronos/src/amazon_chronos/forecaster.py
+++ b/experiments/amazon-chronos/src/amazon_chronos/forecaster.py
+import logging
+from typing import Iterable, List
+
+import numpy as np
+import pandas as pd
+import torch
+from chronos import ChronosPipeline
+from utilsforecast.processing import make_future_dataframe
+
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+class TimeSeriesDataset:
+    def __init__(
+        self,
+        data: torch.Tensor,
+        uids: Iterable,
+        last_times: Iterable,
+        batch_size: int,
+    ):
+        self.data = data
+        self.uids = uids
+        self.last_times = last_times
+        self.batch_size = batch_size
+        self.n_batches = len(data) // self.batch_size + (
+            0 if len(data) % self.batch_size == 0 else 1
+        )
+        self.current_batch = 0
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame, batch_size: int):
+        num_unique_ids = df["unique_id"].nunique()
+        max_series_length = df["unique_id"].value_counts().max()
+        padded_tensor = torch.full(
+            size=(num_unique_ids, max_series_length),
+            fill_value=torch.nan,
+            dtype=torch.bfloat16,
+        )  # type: ignore
+        df_sorted = df.sort_values(by=["unique_id", "ds"])
+        for idx, (_, group) in enumerate(df_sorted.groupby("unique_id")):
+            series_length = len(group)
+            padded_tensor[idx, -series_length:] = torch.tensor(
+                group["y"].values,
+                dtype=torch.bfloat16,
+            )
+        uids = df_sorted["unique_id"].unique()
+        last_times = df_sorted.groupby("unique_id")["ds"].tail(1)
+        return cls(padded_tensor, uids, last_times, batch_size)
+
+    def __len__(self):
+        return len(self.data)
+
+    def make_future_dataframe(self, h: int, freq: str) -> pd.DataFrame:
+        return make_future_dataframe(
+            uids=self.uids,
+            last_times=pd.to_datetime(self.last_times),
+            h=h,
+            freq=freq,
+        )  # type: ignore
+
+    def __iter__(self):
+        self.current_batch = 0  # Reset for new iteration
+        return self
+
+    def __next__(self):
+        if self.current_batch < self.n_batches:
+            start_idx = self.current_batch * self.batch_size
+            end_idx = start_idx + self.batch_size
+            self.current_batch += 1
+            return self.data[start_idx:end_idx]
+        else:
+            raise StopIteration
+
+
+class AmazonChronos:
+    def __init__(self, model_name: str):
+        self.model_name = model_name
+        self.model = ChronosPipeline.from_pretrained(
+            model_name,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+        )
+
+    def forecast(
+        self,
+        df: pd.DataFrame,
+        h: int,
+        freq: str,
+        batch_size: int = 32,
+        quantiles: List[float] | None = None,
+        **predict_kwargs,
+    ) -> pd.DataFrame:
+        main_logger.info("transforming dataframe to tensor")
+        dataset = TimeSeriesDataset.from_df(df, batch_size=batch_size)
+        main_logger.info("forecasting")
+        fcsts = [self.model.predict(batch, prediction_length=h, **predict_kwargs) for batch in dataset]
+        fcst = torch.cat(fcsts)
+        main_logger.info("transforming forecast to dataframe")
+        fcst = fcst.numpy()
+        fcst_df = dataset.make_future_dataframe(h=h, freq=freq)
+        fcst_df[self.model_name] = np.median(fcst, axis=1).reshape(-1, 1)
+        if quantiles is not None:
+            for q in quantiles:
+                q_col = f"{self.model_name}-q-{q}"
+                fcst_df[q_col] = np.quantile(fcst, q, axis=1).reshape(-1, 1)
+        return fcst_df
+
+
+if __name__ == "__main__":
+    # Manual smoke test: forecast 12 monthly steps for the AirPassengers
+    # series, duplicated under a second id so more than one series is used.
+    import pandas as pd
+
+    df = pd.read_csv(
+        "https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv"
+    )
+    # Rename to the unique_id / ds / y layout that TimeSeriesDataset.from_df reads.
+    df = df.rename(columns={"#Passengers": "y", "Month": "ds"})
+    df["ds"] = pd.to_datetime(df["ds"])
+    df.insert(0, "unique_id", "AirPassengers")
+    df = pd.concat([df, df.assign(unique_id="AirPassengers2")])
+    model = AmazonChronos("amazon/chronos-t5-small")
+    fcst_df = model.forecast(df, h=12, freq="MS")
+    print(fcst_df)
--- a/experiments/amazon-chronos/src/amazon_chronos/pipeline.py
+++ b/experiments/amazon-chronos/src/amazon_chronos/pipeline.py
+import os
+from time import time
+from typing import List, Tuple
+
+import fire
+import pandas as pd
+
+
+from ..utils import ExperimentHandler
+from .forecaster import AmazonChronos
+
+
+def run_amazon_chronos(
+    train_df: pd.DataFrame,
+    model_name: str,
+    horizon: int,
+    freq: str,
+    quantiles: List[float],
+) -> Tuple[pd.DataFrame, float, str]:
+    ac = AmazonChronos(model_name)
+    init_time = time()
+    fcsts_df = ac.forecast(
+        df=train_df,
+        h=horizon,
+        freq=freq,
+        batch_size=8,
+        quantiles=quantiles,
+        # parameters as in https://github.com/amazon-science/chronos-forecasting/blob/73be25042f5f587823d46106d372ba133152fb00/README.md?plain=1#L62-L65
+        num_samples=20,
+        temperature=1.0,
+        top_k=50,
+        top_p=1.0,
+    )
+    total_time = time() - init_time
+    return fcsts_df, total_time, model_name
+
+
+def main(dataset: str, model_name: str):
+    exp = ExperimentHandler(dataset)
+    fcst_df, total_time, model_name = run_amazon_chronos(
+        train_df=exp.train_df,
+        model_name=model_name,
+        horizon=exp.horizon,
+        freq=exp.freq,
+        quantiles=exp.quantiles,
+    )
+    exp.save_results(fcst_df, total_time, model_name)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/experiments/amazon-chronos/src/main.py
+++ b/experiments/amazon-chronos/src/main.py
+import logging
+import subprocess
+
+import fire
+import pandas as pd
+
+from src.utils import ExperimentHandler
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+datasets = [
+    "m1_yearly",
+    "m1_quarterly",
+    "m1_monthly",
+    "m3_yearly",
+    "m3_quarterly",
+    "m3_monthly",
+    "m3_other",
+    "tourism_yearly",
+    "tourism_quarterly",
+    "tourism_monthly",
+    "m4_yearly",
+    "m4_quarterly",
+]
+
+amazon_chronos_models = [
+    "amazon/chronos-t5-large",
+    "amazon/chronos-t5-tiny",
+    "amazon/chronos-t5-mini",
+    "amazon/chronos-t5-small",
+    "amazon/chronos-t5-base",
+]
+
+
+def main(mode: str):
+    prefix_process = ["python", "-m"]
+
+    eval_df = None
+    for dataset in datasets:
+        logger.info(f"Evaluating {dataset}...")
+        if mode in ["fcst_statsforecast", "fcst_chronos"]:
+            suffix_process = ["--dataset", dataset]
+
+            def process(middle_process):
+                return prefix_process + middle_process + suffix_process
+
+            if mode == "fcst_statsforecast":
+                logger.info("Running StatisticalEnsemble")
+                subprocess.run(process(["src.statsforecast_pipeline"]))
+            elif mode == "fcst_chronos":
+                for model in amazon_chronos_models:
+                    logger.info(f"Running Amazon Chronos {model}")
+                    chronos_process = process(["src.amazon_chronos.pipeline"])
+                    chronos_process.extend(["--model_name", model])
+                    subprocess.run(chronos_process)
+        elif mode == "evaluation":
+            if eval_df is None:
+                eval_df = []
+            logger.info("Running dataset evaluation")
+            exp = ExperimentHandler(dataset)
+            try:
+                eval_dataset_df = exp.evaluate_models(
+                    amazon_chronos_models + ["StatisticalEnsemble", "SeasonalNaive"]
+                )
+                print(eval_dataset_df)
+                eval_df.append(eval_dataset_df)
+            except Exception as e:
+                logger.error(e)
+    if eval_df is not None:
+        eval_df = pd.concat(eval_df).reset_index(drop=True)
+        exp.save_dataframe(eval_df, "complete-results.csv")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/experiments/amazon-chronos/src/statsforecast_pipeline.py
+++ b/experiments/amazon-chronos/src/statsforecast_pipeline.py
+import os
+from time import time
+from typing import List, Tuple
+
+os.environ["NIXTLA_NUMBA_RELEASE_GIL"] = "1"
+os.environ["NIXTLA_NUMBA_CACHE"] = "1"
+
+import fire
+import numpy as np
+import pandas as pd
+from scipy.stats import norm
+from statsforecast import StatsForecast
+from statsforecast.models import (
+    AutoARIMA,
+    AutoETS,
+    AutoCES,
+    DynamicOptimizedTheta,
+    SeasonalNaive,
+)
+
+from src.utils import ExperimentHandler
+
+
+def run_seasonal_naive(
+    train_df: pd.DataFrame,
+    horizon: int,
+    freq: str,
+    seasonality: int,
+    level: List[int],
+) -> Tuple[pd.DataFrame, float, str]:
+    os.environ["NIXTLA_ID_AS_COL"] = "true"
+    sf = StatsForecast(
+        models=[SeasonalNaive(season_length=seasonality)],
+        freq=freq,
+        n_jobs=-1,
+    )
+    model = sf
+    init_time = time()
+    fcsts_df = model.forecast(df=train_df, h=horizon, level=level)
+    total_time = time() - init_time
+    return fcsts_df, total_time, "SeasonalNaive"
+
+
+def ensemble_forecasts(
+    fcsts_df: pd.DataFrame,
+    quantiles: List[float],
+    name_models: List[str],
+    model_name: str,
+) -> pd.DataFrame:
+    fcsts_df[model_name] = fcsts_df[name_models].mean(axis=1).values  # type: ignore
+    # compute quantiles based on the mean of the forecasts
+    sigma_models = []
+    for model in name_models:
+        fcsts_df[f"sigma_{model}"] = fcsts_df[f"{model}-hi-68.27"] - fcsts_df[model]
+        sigma_models.append(f"sigma_{model}")
+    fcsts_df[f"std_{model_name}"] = (
+        fcsts_df[sigma_models].pow(2).sum(axis=1).div(len(sigma_models) ** 2).pow(0.5)
+    )
+    z = norm.ppf(quantiles)
+    q_cols = []
+    for q, zq in zip(quantiles, z):
+        q_col = f"{model_name}-q-{q}"
+        fcsts_df[q_col] = fcsts_df[model_name] + zq * fcsts_df[f"std_{model_name}"]
+        q_cols.append(q_col)
+    fcsts_df = fcsts_df[["unique_id", "ds"] + [model_name] + q_cols]
+    return fcsts_df
+
+
+def run_statistical_ensemble(
+    train_df: pd.DataFrame,
+    horizon: int,
+    freq: str,
+    seasonality: int,
+    quantiles: List[float],
+) -> Tuple[pd.DataFrame, float, str]:
+    os.environ["NIXTLA_ID_AS_COL"] = "true"
+    models = [
+        AutoARIMA(season_length=seasonality),
+        AutoETS(season_length=seasonality),
+        AutoCES(season_length=seasonality),
+        DynamicOptimizedTheta(season_length=seasonality),
+    ]
+    init_time = time()
+    series_per_core = 15
+    n_series = train_df["unique_id"].nunique()
+    n_jobs = min(n_series // series_per_core, os.cpu_count())
+    sf = StatsForecast(
+        models=models,
+        freq=freq,
+        n_jobs=n_jobs,
+    )
+    fcsts_df = sf.forecast(df=train_df, h=horizon, level=[68.27])
+    name_models = [repr(model) for model in models]
+    model_name = "StatisticalEnsemble"
+    fcsts_df = ensemble_forecasts(
+        fcsts_df,
+        quantiles,
+        name_models,
+        model_name,
+    )
+    total_time = time() - init_time
+    return fcsts_df, total_time, model_name
+
+
+def main(dataset: str):
+    exp = ExperimentHandler(dataset)
+    # seasonal naive benchmark
+    fcst_df, total_time, model_name = run_seasonal_naive(
+        train_df=exp.train_df,
+        horizon=exp.horizon,
+        freq=exp.freq,
+        seasonality=exp.seasonality,
+        level=exp.level,
+    )
+    fcst_df = exp.fcst_from_level_to_quantiles(fcst_df, model_name)
+    exp.save_results(fcst_df, total_time, model_name)
+    # statistical ensemble
+    fcst_df, total_time, model_name = run_statistical_ensemble(
+        train_df=exp.train_df,
+        horizon=exp.horizon,
+        freq=exp.freq,
+        seasonality=exp.seasonality,
+        quantiles=exp.quantiles,
+    )
+    exp.save_results(fcst_df, total_time, model_name)
+
+
+if __name__ == "__main__":
+    from statsforecast.utils import AirPassengers as ap
+
+    # Run one small AutoARIMA forecast before the experiment starts.
+    # NOTE(review): presumably a warm-up so numba compilation is not counted
+    # in the measured timings — confirm.
+    AutoARIMA(season_length=12).forecast(ap.astype(np.float32), h=12)
+    fire.Fire(main)
--- a/experiments/amazon-chronos/src/utils.py
+++ b/experiments/amazon-chronos/src/utils.py
+from functools import partial
+from pathlib import Path
+from typing import List
+
+import numpy as np
+import pandas as pd
+from gluonts.dataset import Dataset
+from gluonts.dataset.repository.datasets import (
+    get_dataset,
+    dataset_names as gluonts_datasets,
+)
+from gluonts.time_feature.seasonality import get_seasonality
+from utilsforecast.evaluation import evaluate
+from utilsforecast.losses import mase, smape
+
+
+def quantile_loss(
+    df: pd.DataFrame,
+    models: list,
+    q: float = 0.5,
+    id_col: str = "unique_id",
+    target_col: str = "y",
+) -> pd.DataFrame:
+    delta_y = df[models].sub(df[target_col], axis=0)
+    res = (
+        np.maximum(q * delta_y, (q - 1) * delta_y)
+        .groupby(df[id_col], observed=True)
+        .mean()
+    )
+    res.index.name = id_col
+    res = res.reset_index()
+    return res
+
+
+class ExperimentHandler:
+    def __init__(
+        self,
+        dataset: str,
+        quantiles: List[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
+        results_dir: str = "./results",
+        models_dir: str = "./models",
+    ):
+        if dataset not in gluonts_datasets:
+            raise Exception(
+                f"dataset {dataset} not found in gluonts "
+                f"available datasets: {', '.join(gluonts_datasets)}"
+            )
+        self.dataset = dataset
+        self.quantiles = quantiles
+        self.level = self._transform_quantiles_to_levels(quantiles)
+        self.results_dir = results_dir
+        self.models_dir = models_dir
+        # defining datasets
+        self._maybe_download_m3_file(self.dataset)
+        gluonts_dataset = get_dataset(self.dataset)
+        self.horizon = gluonts_dataset.metadata.prediction_length
+        if self.horizon is None:
+            raise Exception(
+                f"horizon not found for dataset {self.dataset} "
+                "experiment cannot be run"
+            )
+        self.freq = gluonts_dataset.metadata.freq
+        self.seasonality = get_seasonality(self.freq)
+        self.gluonts_train_dataset = gluonts_dataset.train
+        self.gluonts_test_dataset = gluonts_dataset.test
+        self._create_dir_if_not_exists(self.results_dir)
+
+    @staticmethod
+    def _maybe_download_m3_file(dataset: str):
+        if dataset[:2] == "m3":
+            m3_file = Path.home() / ".gluonts" / "datasets" / "M3C.xls"
+            if not m3_file.exists():
+                from datasetsforecast.m3 import M3
+                from datasetsforecast.utils import download_file
+
+                download_file(m3_file.parent, M3.source_url)
+
+    @staticmethod
+    def _transform_quantiles_to_levels(quantiles: List[float]) -> List[int]:
+        level = [
+            int(100 - 200 * q) for q in quantiles if q < 0.5
+        ]  # in this case mean=mediain
+        level = sorted(list(set(level)))
+        return level
+
+    @staticmethod
+    def _create_dir_if_not_exists(directory: str):
+        Path(directory).mkdir(parents=True, exist_ok=True)
+
+    @staticmethod
+    def _transform_gluonts_instance_to_df(
+        ts: dict,
+        last_n: int | None = None,
+    ) -> pd.DataFrame:
+        start_period = ts["start"]
+        start_ds, freq = start_period.to_timestamp(), start_period.freq
+        target = ts["target"]
+        ds = pd.date_range(start=start_ds, freq=freq, periods=len(target))
+        if last_n is not None:
+            target = target[-last_n:]
+            ds = ds[-last_n:]
+        ts_df = pd.DataFrame({"unique_id": ts["item_id"], "ds": ds, "y": target})
+        return ts_df
+
+    @staticmethod
+    def _transform_gluonts_dataset_to_df(
+        gluonts_dataset: Dataset,
+        last_n: int | None = None,
+    ) -> pd.DataFrame:
+        df = pd.concat(
+            [
+                ExperimentHandler._transform_gluonts_instance_to_df(ts, last_n=last_n)
+                for ts in gluonts_dataset
+            ]
+        )
+        df = df.reset_index(drop=True)
+        return df
+
+    @property
+    def train_df(self) -> pd.DataFrame:
+        train_df = self._transform_gluonts_dataset_to_df(self.gluonts_train_dataset)
+        return train_df
+
+    @property
+    def test_df(self) -> pd.DataFrame:
+        test_df = self._transform_gluonts_dataset_to_df(
+            self.gluonts_test_dataset,
+            last_n=self.horizon,
+        )
+        return test_df
+
+    def save_dataframe(self, df: pd.DataFrame, file_name: str):
+        df.to_csv(f"{self.results_dir}/{file_name}", index=False)
+
+    def save_results(self, fcst_df: pd.DataFrame, total_time: float, model_name: str):
+        self.save_dataframe(
+            fcst_df,
+            f"{model_name}-{self.dataset}-fcst.csv",
+        )
+        time_df = pd.DataFrame({"time": [total_time], "model": model_name})
+        self.save_dataframe(
+            time_df,
+            f"{model_name}-{self.dataset}-time.csv",
+        )
+
+    def fcst_from_level_to_quantiles(
+        self,
+        fcst_df: pd.DataFrame,
+        model_name: str,
+    ) -> pd.DataFrame:
+        fcst_df = fcst_df.copy()
+        cols = ["unique_id", "ds", model_name]
+        for q in self.quantiles:
+            if q == 0.5:
+                col = f"{model_name}"
+            else:
+                lv = int(100 - 200 * q)
+                hi_or_lo = "lo" if lv > 0 else "hi"
+                lv = abs(lv)
+                col = f"{model_name}-{hi_or_lo}-{lv}"
+            q_col = f"{model_name}-q-{q}"
+            fcst_df[q_col] = fcst_df[col].values
+            cols.append(q_col)
+        return fcst_df[cols]
+
+    def evaluate_models(self, models: List[str]) -> pd.DataFrame:
+        test_df = self.test_df
+        train_df = self.train_df
+        fcsts_df = []
+        times_df = []
+        for model in models:
+            fcst_method_df = pd.read_csv(
+                f"{self.results_dir}/{model}-{self.dataset}-fcst.csv"
+            ).set_index(["unique_id", "ds"])
+            fcsts_df.append(fcst_method_df)
+            time_method_df = pd.read_csv(
+                f"{self.results_dir}/{model}-{self.dataset}-time.csv"
+            )
+            times_df.append(time_method_df)
+        fcsts_df = pd.concat(fcsts_df, axis=1).reset_index()
+        fcsts_df["ds"] = pd.to_datetime(fcsts_df["ds"])
+        times_df = pd.concat(times_df)
+        test_df = test_df.merge(fcsts_df, how="left")
+        assert test_df.isna().sum().sum() == 0, "merge contains nas"
+        # point evaluation
+        point_fcsts_cols = ["unique_id", "ds", "y"] + models
+        test_df["unique_id"] = test_df["unique_id"].astype(str)
+        train_df["unique_id"] = train_df["unique_id"].astype(str)
+        mase_seas = partial(mase, seasonality=self.seasonality)
+        eval_df = evaluate(
+            test_df[point_fcsts_cols],
+            train_df=train_df,
+            metrics=[smape, mase_seas],
+        )
+        # probabilistic evaluation
+        eval_prob_df = []
+        for q in self.quantiles:
+            prob_cols = [f"{model}-q-{q}" for model in models]
+            eval_q_df = quantile_loss(test_df, models=prob_cols, q=q)
+            eval_q_df[prob_cols] = eval_q_df[prob_cols] * self.horizon
+            eval_q_df = eval_q_df.rename(columns=dict(zip(prob_cols, models)))
+            eval_q_df["metric"] = f"quantile-loss-{q}"
+            eval_prob_df.append(eval_q_df)
+        eval_prob_df = pd.concat(eval_prob_df)
+        eval_prob_df = eval_prob_df.groupby("metric").sum().reset_index()
+        total_y = test_df["y"].sum()
+        eval_prob_df[models] = eval_prob_df[models] / total_y
+        eval_prob_df["metric"] = "scaled_crps"
+        eval_df = pd.concat([eval_df, eval_prob_df]).reset_index(drop=True)
+        eval_df = eval_df.groupby("metric").mean(numeric_only=True).reset_index()
+        eval_df = eval_df.melt(id_vars="metric", value_name="value", var_name="model")
+        times_df.insert(0, "metric", "time")
+        times_df = times_df.rename(columns={"time": "value"})
+        eval_df = pd.concat([eval_df, times_df])
+        eval_df.insert(0, "dataset", self.dataset)
+        eval_df = eval_df.sort_values(["dataset", "metric", "model"])
+        eval_df = eval_df.reset_index(drop=True)
+        return eval_df
--- a/experiments/azure-automl-forecasting/.env.example
+++ b/experiments/azure-automl-forecasting/.env.example
+AZURE_SUBSCRIPTION_ID=
+AZURE_RESOURCE_GROUP=
+AZURE_WORKSPACE_NAME=
+TIMEGPT_TOKEN=
+
--- a/experiments/azure-automl-forecasting/Makefile
+++ b/experiments/azure-automl-forecasting/Makefile
+# Dataset files (named <Frequency>_<pandas freq>.parquet) used by every target.
+TS_FILES := Hourly_H.parquet Daily_D.parquet Weekly_W-MON.parquet Monthly_MS.parquet
+FILTERED_TS_FILES := $(patsubst %,./data/filtered_datasets/%,$(TS_FILES))
+
+# None of these targets produces a file with its own name; declaring them
+# phony keeps a stray file or directory of the same name from masking them.
+.PHONY: filter_data run_timegpt run_sn run_automl run_methods \
+  download_automl_forecasts evaluate_experiments .require-dataset_path
+
+# Run the filtering step on every raw dataset under ./data.
+filter_data:
+	@for file in $(TS_FILES); do \
+		python -m src.utils.filter_data --dataset_path ./data/$$file; \
+	done
+
+# The three run_* targets need `dataset_path=<file>` on the command line.
+run_timegpt: .require-dataset_path
+	@echo Running TimeGPT with dataset_path=$(dataset_path)
+	@python -m src.nixtla_timegpt --dataset_path $(dataset_path)
+
+run_sn: .require-dataset_path
+	@echo Running SN with dataset_path=$(dataset_path)
+	@python -m src.statsforecast_sn --dataset_path $(dataset_path)
+
+run_automl: .require-dataset_path
+	@echo Running AutoML with dataset_path=$(dataset_path)
+	@python -m src.azure_automl.forecasting --dataset_path $(dataset_path)
+
+# Run every method on every filtered dataset.
+run_methods:
+	@for file in $(TS_FILES); do \
+		echo "Running methods for $$file"; \
+		$(MAKE) run_timegpt dataset_path=./data/filtered_datasets/$$file; \
+		$(MAKE) run_sn dataset_path=./data/filtered_datasets/$$file; \
+		$(MAKE) run_automl dataset_path=./data/filtered_datasets/$$file; \
+	done
+
+download_automl_forecasts:
+	@python -m src.azure_automl.download_forecasts
+
+# Pass the filtered dataset paths as one comma-separated argument.
+evaluate_experiments:
+	@python -m src.evaluation --datasets_paths "$(shell echo $(FILTERED_TS_FILES) | tr ' ' ',')"
+
+# Guard prerequisite: fails with an error when dataset_path is not set.
+.require-dataset_path:
+ifndef dataset_path
+	$(error dataset_path is required)
+endif
+
+
--- a/experiments/azure-automl-forecasting/README.md
+++ b/experiments/azure-automl-forecasting/README.md
+# Nixtla TimeGPT vs. Azure AutoML: A Comprehensive Performance Analysis
+
+This experiment evaluates the performance of **Nixtla TimeGPT's zero-shot inference** against **Microsoft's Azure AutoML** in the domain of time series forecasting. Our analysis shows that TimeGPT **surpasses Azure AutoML by 12%, 12%, and 10% in MAE, RMSE, and MASE metrics**, respectively, and achieves a **300x improvement in computational efficiency**. This evaluation spanned over 3,000 distinct time series across various data frequencies, with considerations for Azure AutoML's cost constraints.
+
+# Introduction
+
+[Azure AutoML](https://learn.microsoft.com/en-us/azure/machine-learning/concept-automl-forecasting-methods?view=azureml-api-2), a product of Microsoft, offers a robust automated machine-learning solution that caters to a wide array of predictive tasks, including time series forecasting. TimeGPT is a foundational model for time series forecasting that can be accessed [through an API](https://docs.nixtla.io/). While Azure AutoML is known for its adaptability and ease of use, our findings reveal that TimeGPT offers superior accuracy and efficiency, especially in the context of time series data.
+
+## Empirical Evaluation
+
+Our study involved a detailed comparison of both models across various datasets, including Hourly, Daily, Weekly, and Monthly data frequencies. The datasets were chosen from the test set of the [TimeGPT-1 paper](https://arxiv.org/abs/2310.03589), ensuring a diverse set of time series for evaluation. The selection process was designed to manage computational complexity and adhere to Azure AutoML's dataset size requirements, with a cap of 3,000 observations to maintain cost-effectiveness.
+
+## Results
+
+The following table shows the main findings of our analysis, presenting a comparison of performance metrics (MASE, MAE, RMSE) and computational time (in seconds) across different datasets. The best results are highlighted in **bold** for clarity.
+
+<img width="632" alt="image" src="https://github.com/Nixtla/nixtla/assets/10517170/0cc4285e-2572-4f08-9846-94c68ad72e8b">
+
+
+## Reproducibility
+
+All experiments were conducted in controlled environments to uphold the integrity and reproducibility of our results. TimeGPT evaluations were performed using a 2020 MacBook Air with an M1 chip, ensuring accessibility and practicality. In contrast, Azure AutoML experiments were carried out on a cluster of 11 STANDARD_DS5_V2 virtual machines equipped with substantial computational resources to showcase its scalability and power.
+
+### Instructions
+
+1. Configure Azure AutoML according to the official Microsoft documentation.
+2. Set the environment variables in a `.env` file, using `.env.example` as a template.
+3. Set up a conda environment using:
+
+```bash
+mamba create -n azure-automl-fcst python=3.10
+conda activate azure-automl-fcst
+pip install uv
+uv pip install -r requirements.txt
+```
+
+4. Download the data using:
+
+```bash
+python -m src.utils.download_data
+```
+
+If you're interested in replicating the results, write to us at `support@nixtla.io` and we will give you access to the data.
+
+5. Filter the datasets to prevent AzureML from crashing
+
+```
+make filter_data
+```
+
+6. Run the forecasting tasks for TimeGPT, SeasonalNaive, and AzureAutoML using the following:
+
+```
+make run_methods
+```
+
+Note that AzureAutoML will send the job to the predefined cluster.
+
+7. Retrieve AzureAutoML forecasts once they are ready:
+
+```
+make download_automl_forecasts
+```
+
+8. Run evaluation
+
+```
+make evaluate_experiments
+```
+
+
+### References
+- [TimeGPT 1](https://arxiv.org/abs/2310.03589)
+- [StatsForecast](https://github.com/Nixtla/statsforecast/)
+- [Distributed AzureAutoML for forecasting](https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/pipelines/1k_demand_forecasting_with_pipeline_components/automl-forecasting-demand-many-models-in-pipeline/automl-forecasting-demand-many-models-in-pipeline.ipynb)
--- a/experiments/azure-automl-forecasting/requirements.txt
+++ b/experiments/azure-automl-forecasting/requirements.txt
+azure-ai-ml
+azure-identity
+azureml-core
+fire
+mltable
+nixtla
+pandas
+python-dotenv
+rich
+statsforecast
+utilsforecast
--- a/experiments/azure-automl-forecasting/src/azure_automl/__init__.py
+++ b/experiments/azure-automl-forecasting/src/azure_automl/__init__.py
--- a/experiments/azure-automl-forecasting/src/azure_automl/automl_handler.py
+++ b/experiments/azure-automl-forecasting/src/azure_automl/automl_handler.py
+import json
+import logging
+import os
+import yaml
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import numpy as np
+import pandas as pd
+from azure.ai.ml import Input
+from azure.ai.ml import MLClient
+from azure.ai.ml.constants import AssetTypes
+from azure.ai.ml.dsl import pipeline
+from azure.ai.ml.entities import AmlCompute, Job
+from azure.identity import DefaultAzureCredential
+from dotenv import load_dotenv
+
+# Pull settings from a local .env file and configure INFO-level logging.
+load_dotenv()
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+# Silence every already-registered logger whose name starts with "azure".
+loggers = logging.Logger.manager.loggerDict
+for logger_name in loggers:
+    if not logger_name.startswith("azure"):
+        continue
+    logger = logging.getLogger(logger_name)
+    logger.propagate = False
+    logger.disabled = True
+
+
+def str_to_datetime(date_str: str) -> pd.Timestamp:
+    """Parse ``date_str`` into a ``pandas.Timestamp``."""
+    timestamp = pd.Timestamp(date_str)
+    return timestamp
+
+
+def df_to_parquet_azureml_input(df: pd.DataFrame, dir: str) -> Input:
+    """Write ``df`` to ``<dir>/series.parquet`` and return a folder ``Input`` for ``dir``."""
+    target_file = Path(dir).joinpath("series.parquet")
+    df.to_parquet(target_file, index=False)
+    return Input(type=AssetTypes.URI_FOLDER, path=dir)
+
+
+def config_to_yaml_azureml_input(config: dict, dir: str) -> Input:
+    """Dump ``config`` to ``<dir>/config.yaml`` and return a file ``Input`` pointing at it."""
+    yaml_file = Path(dir).joinpath("config.yaml")
+    with open(yaml_file, "w") as stream:
+        yaml.dump(config, stream)
+    return Input(type="uri_file", path=str(yaml_file))
+
+
+class AzureAutoML:
+    """
+    Helper around the Azure ML SDK to submit AutoML forecasting pipeline jobs
+    and to retrieve their status, run time and forecasts.
+
+    Before using this class, you need to login to Azure.
+    Use the following command to login:
+    $ az login
+    """
+
+    def __init__(
+        self,
+        subscription_id: str,
+        resource_group_name: str,
+        workspace_name: str,
+    ):
+        self.subscription_id = subscription_id
+        self.resource_group_name = resource_group_name
+        self.workspace_name = workspace_name
+
+    @classmethod
+    def from_environment(cls) -> "AzureAutoML":
+        """Build an instance from the ``AZURE_SUBSCRIPTION_ID``,
+        ``AZURE_RESOURCE_GROUP`` and ``AZURE_WORKSPACE_NAME`` environment variables.
+
+        Raises:
+            KeyError: if any of the three variables is not set.
+        """
+        return cls(
+            subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
+            resource_group_name=os.environ["AZURE_RESOURCE_GROUP"],
+            workspace_name=os.environ["AZURE_WORKSPACE_NAME"],
+        )
+
+    def get_ml_client(self, registry_name: str | None = None) -> MLClient:
+        """Return an ``MLClient`` for the workspace, or for ``registry_name`` when given."""
+        kwargs = {}
+        if not registry_name:
+            kwargs["workspace_name"] = self.workspace_name
+        else:
+            kwargs["registry_name"] = registry_name
+        credential = DefaultAzureCredential(exclude_managed_identity_credential=True)
+        ml_client = MLClient(
+            credential=credential,
+            subscription_id=self.subscription_id,
+            resource_group_name=self.resource_group_name,
+            **kwargs,
+        )
+        return ml_client
+
+    def get_train_and_inference_components(self) -> tuple:
+        """Fetch the latest many-models training and inference components
+        from the ``azureml`` registry.
+
+        Returns:
+            ``(train_component, inference_component)``.
+        """
+        ml_client_registry = self.get_ml_client("azureml")
+        train_component = ml_client_registry.components.get(
+            name="automl_many_models_training",
+            label="latest",
+        )
+        inference_component = ml_client_registry.components.get(
+            name="automl_many_models_inference",
+            label="latest",
+        )
+        return train_component, inference_component
+
+    def forecast(
+        self,
+        df: pd.DataFrame,
+        df_test: pd.DataFrame,
+        aml_compute: AmlCompute,
+        h: int,
+        freq: str,
+        id_col: str = "unique_id",
+        time_col: str = "ds",
+        target_col: str = "y",
+        primary_metric: str = "normalized_root_mean_squared_error",
+        n_cross_validations: str | int = "auto",
+        experiment_name: str | None = None,
+        begin_create_or_update_aml_compute: bool = False,
+        max_trials: int = 25,
+        enable_early_stopping: bool = True,
+        max_nodes: int = 1,
+        max_concurrency_per_node: int = 1,
+        forecast_mode: str = "rolling",
+        retrain_failed_model: bool = False,
+    ) -> str:
+        """Submit a training + inference AutoML pipeline job.
+
+        ``df`` and ``df_test`` are written to temporary parquet folders and the
+        AutoML settings to a temporary YAML file; all three are passed to the
+        pipeline as inputs. The call returns once the job has been submitted,
+        without waiting for it to finish.
+
+        Args:
+            df: training data.
+            df_test: data passed to the inference component.
+            aml_compute: compute target; only created/updated when
+                ``begin_create_or_update_aml_compute`` is True.
+            h: forecast horizon, also used as the forecast step.
+            freq: frequency written to the AutoML config.
+            experiment_name: defaults to ``automl-forecasting-job-<random id>``.
+
+        Returns:
+            The name of the submitted pipeline job.
+        """
+        if experiment_name is None:
+            random_id = np.random.randint(10000, 99999)
+            experiment_name = f"automl-forecasting-job-{random_id}"
+        ml_client = self.get_ml_client()
+        train_component, inference_component = self.get_train_and_inference_components()
+        automl_config_dict = dict(
+            task="forecasting",
+            forecast_horizon=h,
+            forecast_step=h,
+            frequency=freq,
+            time_series_id_column_names=id_col,
+            partition_column_names=[id_col],
+            time_column_name=time_col,
+            label_column_name=target_col,
+            primary_metric=primary_metric,
+            n_cross_validations=n_cross_validations,
+            max_trials=max_trials,
+            enable_early_stopping=enable_early_stopping,
+            track_child_runs=False,
+            allow_multi_partitions=False,
+        )
+
+        @pipeline(description="pipeline for automl forecasting")
+        def forecasting_pipeline(
+            training_data: Input,
+            test_data: Input,
+            automl_config: Input,
+            compute_name: str,
+        ):
+            # training node
+            training_node = train_component(
+                raw_data=training_data,
+                automl_config=automl_config,
+                max_concurrency_per_node=max_concurrency_per_node,
+                max_nodes=max_nodes,
+                retrain_failed_model=retrain_failed_model,
+                compute_name=compute_name,
+            )
+            # inference node, fed with the metadata produced by the training node
+            inference_node = inference_component(
+                raw_data=test_data,
+                max_nodes=max_nodes,
+                max_concurrency_per_node=max_concurrency_per_node,
+                optional_train_metadata=training_node.outputs.run_output,
+                forecast_mode=forecast_mode,
+                forecast_step=h,
+                compute_name=compute_name,
+            )
+            return {"forecast_output": inference_node.outputs.raw_predictions}
+
+        if begin_create_or_update_aml_compute:
+            main_logger.info("Begin create or update aml compute")
+            ml_client.compute.begin_create_or_update(aml_compute).result()
+
+        cwd = Path.cwd()
+        # The temporary inputs must outlive the submission call, so the job is
+        # created inside the ``with`` block.
+        with TemporaryDirectory(dir=cwd) as tmp_dir, TemporaryDirectory(
+            dir=cwd
+        ) as tmp_dir_test, TemporaryDirectory(dir=cwd) as tmp_dir_config:
+            main_logger.info("Transforming datasets to parquet")
+            table_data_input = df_to_parquet_azureml_input(df, dir=tmp_dir)
+            table_data_input_test = df_to_parquet_azureml_input(
+                df_test,
+                dir=tmp_dir_test,
+            )
+            automl_config = config_to_yaml_azureml_input(
+                automl_config_dict,
+                dir=tmp_dir_config,
+            )
+            pipeline_job = forecasting_pipeline(
+                training_data=table_data_input,
+                test_data=table_data_input_test,
+                automl_config=automl_config,
+                compute_name=aml_compute.name,
+            )
+            pipeline_job.settings.default_compute = aml_compute.name
+            main_logger.info("Begin submitting pipeline job")
+            returned_pipeline_job = ml_client.jobs.create_or_update(
+                pipeline_job,
+                experiment_name=experiment_name,
+            )
+        return returned_pipeline_job.name
+
+    def get_job(self, job_name: str) -> Job:
+        """Fetch the job called ``job_name`` from the workspace."""
+        ml_client = self.get_ml_client()
+        job = ml_client.jobs.get(job_name)
+        return job
+
+    def get_job_status(self, job_name: str) -> str | None:
+        """Return the status reported for ``job_name``."""
+        job = self.get_job(job_name)
+        return job.status
+
+    def get_job_total_time(self, job_name: str) -> float | None:
+        """Return the execution time of ``job_name`` in seconds.
+
+        Returns:
+            ``None`` when the job has not started or reports no stages yet;
+            the time elapsed so far when the execution stage has no end time
+            yet; otherwise the difference between its end and start times.
+
+        Raises:
+            RuntimeError: if the execution stage has status ``"Failed"``.
+        """
+        job = self.get_job(job_name)
+        if job.status == "NotStarted":
+            main_logger.info(f"Job {job_name} is not started yet")
+            return None
+        stages_key = "azureml.pipelines.stages"
+        if stages_key not in job.properties:
+            main_logger.info(f"Job {job_name} has no stages yet")
+            return None
+        stages = json.loads(job.properties[stages_key])
+        execution_info = stages["Execution"]
+        status = execution_info["Status"]
+        if status == "Failed":
+            raise RuntimeError(f"Job {job_name} failed")
+        start_time = str_to_datetime(execution_info["StartTime"])
+        if "EndTime" not in execution_info:
+            # Still running: report the elapsed time instead of falling through
+            # to the "EndTime" lookup below, which would raise KeyError.
+            elapsed = pd.Timestamp.now(tz=start_time.tz) - start_time
+            main_logger.info(
+                f"Job has status {status}, total time so far: {elapsed.total_seconds()}"
+            )
+            return elapsed.total_seconds()
+        end_time = str_to_datetime(execution_info["EndTime"])
+        total_time = end_time - start_time
+        return total_time.total_seconds()
+
+    def get_forecast_df(self, job_name: str) -> pd.DataFrame | None:
+        """Download the ``forecast_output`` of a completed job as a DataFrame.
+
+        Returns:
+            ``None`` when the job status is not ``"Completed"``.
+        """
+        job_status = self.get_job_status(job_name)
+        if job_status != "Completed":
+            main_logger.info(f"Job {job_name} is not completed yet")
+            return None
+        ml_client = self.get_ml_client()
+        cwd = Path.cwd()
+        with TemporaryDirectory(dir=cwd) as tmp_dir:
+            ml_client.jobs.download(
+                job_name,
+                download_path=tmp_dir,
+                output_name="forecast_output",
+            )
+            output_path = Path(tmp_dir) / "named-outputs" / "forecast_output"
+            # Read before leaving the block: the directory is deleted on exit.
+            forecast_df = pd.read_parquet(output_path)
+        return forecast_df
--- a/experiments/azure-automl-forecasting/src/azure_automl/download_forecasts.py
+++ b/experiments/azure-automl-forecasting/src/azure_automl/download_forecasts.py
+import logging
+from pathlib import Path
+
+import fire
+
+from .automl_handler import AzureAutoML
+from .forecasting import AzureAutoMLJobs
+from src.utils.data_handler import ForecastDataset
+
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+def download_forecasts(dir: str = "./results"):
+    """Download the forecasts of the latest Azure AutoML job of each experiment.
+
+    For every experiment recorded by ``AzureAutoMLJobs`` only the most recently
+    created job is considered. Its forecasts and total run time are saved under
+    ``<dir>/azure_automl/<experiment_name>``. Jobs that fail to download or are
+    not ready yet are logged and skipped.
+    """
+    azure_automl = AzureAutoML.from_environment()
+    azure_automl_experiments = AzureAutoMLJobs()
+    results_path = Path(dir) / "azure_automl"
+
+    jobs_df = azure_automl_experiments.get_jobs_df()
+    # newest first, so drop_duplicates keeps the latest job of each experiment
+    jobs_df = jobs_df.sort_values("created_at", ascending=False).drop_duplicates(
+        "experiment_name"
+    )
+
+    for _, row in jobs_df.iterrows():
+        experiment_name = row.experiment_name
+        job_name = row.job_name
+        main_logger.info(
+            f"Downloading forecasts for experiment {experiment_name} and job {job_name}"
+        )
+        try:
+            forecast_df = azure_automl.get_forecast_df(job_name)
+            total_time = azure_automl.get_job_total_time(job_name)
+        except Exception:
+            # Best effort: one failing job must not abort the others, but keep
+            # the traceback so the cause is not lost.
+            main_logger.exception(
+                f"Failed to download forecasts for experiment {experiment_name} and job {job_name}"
+            )
+            continue
+        if forecast_df is None:
+            main_logger.info(
+                f"Failed to download forecasts for experiment {experiment_name} and job {job_name}"
+                " probably because the job is not finished yet or failed"
+            )
+            continue
+        fcst_dataset = ForecastDataset(forecast_df=forecast_df, total_time=total_time)
+        fcst_dataset.save_to_dir(results_path / experiment_name)
+        main_logger.info(
+            f"Saved forecasts for experiment {experiment_name} and job {job_name}"
+        )
+
+
+if __name__ == "__main__":
+    fire.Fire(download_forecasts)
--- a/experiments/azure-automl-forecasting/src/azure_automl/forecasting.py
+++ b/experiments/azure-automl-forecasting/src/azure_automl/forecasting.py
+from pathlib import Path
+
+import fire
+import pandas as pd
+from azure.ai.ml.entities import AmlCompute
+
+from .automl_handler import AzureAutoML
+from src.utils.data_handler import ExperimentDataset
+
+
+class AzureAutoMLJobs:
+    """
+    Registry of the submitted Azure AutoML pipeline jobs, persisted as a CSV file.
+    The stored job names are needed to download the forecasts later.
+    """
+
+    file_name = "forecasting_jobs.csv"
+
+    def __init__(self, dir: str = "./azure_automl_results"):
+        self.dir = dir
+        self.jobs_path = Path(self.dir) / self.file_name
+        self.setup()
+
+    def setup(self):
+        """Create the directory and, if missing, a header-only jobs CSV."""
+        self.jobs_path.parent.mkdir(parents=True, exist_ok=True)
+        if self.jobs_path.exists():
+            return
+        empty_jobs = pd.DataFrame(columns=["created_at", "experiment_name", "job_name"])
+        empty_jobs.to_csv(self.jobs_path, index=False)
+
+    def get_jobs_df(self) -> pd.DataFrame:
+        """Read the jobs CSV into a DataFrame."""
+        return pd.read_csv(self.jobs_path)
+
+    def save_job(self, job_name: str, experiment_name: str):
+        """Append one job, stamped with the current time, and rewrite the CSV."""
+        existing_jobs = self.get_jobs_df()
+        record = {
+            "created_at": [pd.Timestamp.now()],
+            "experiment_name": [experiment_name],
+            "job_name": [job_name],
+        }
+        updated_jobs = pd.concat([existing_jobs, pd.DataFrame(record)])
+        updated_jobs.to_csv(self.jobs_path, index=False)
+
+
+def start_forecasting_job(
+    dataset_path: str,
+    begin_create_or_update_aml_compute: bool = False,
+):
+    """Submit an Azure AutoML forecasting job for ``dataset_path`` and record it.
+
+    The experiment is named after the dataset file (text before its first dot).
+    """
+    file_name = dataset_path.split("/")[-1]
+    experiment_name = file_name.split(".")[0]
+    dataset = ExperimentDataset.from_parquet(parquet_path=dataset_path)
+    azure_automl = AzureAutoML.from_environment()
+    azure_automl_jobs = AzureAutoMLJobs()
+
+    # Fixed-size cluster: min and max instances are both 11.
+    cluster = AmlCompute(
+        name="azure-automl-fcst-cluster-nixtla",
+        min_instances=11,
+        max_instances=11,
+        size="STANDARD_DS5_V2",
+    )
+
+    forecast_kwargs = dict(
+        df=dataset.Y_df_train,
+        df_test=dataset.Y_df_test,
+        aml_compute=cluster,
+        h=dataset.horizon,
+        freq=dataset.pandas_frequency,
+        n_cross_validations=2,
+        experiment_name=experiment_name,
+        begin_create_or_update_aml_compute=begin_create_or_update_aml_compute,
+        max_nodes=11,
+        max_concurrency_per_node=8,
+    )
+    submitted_job_name = azure_automl.forecast(**forecast_kwargs)
+
+    azure_automl_jobs.save_job(submitted_job_name, experiment_name)
+
+
+if __name__ == "__main__":
+    fire.Fire(start_forecasting_job)
--- a/experiments/azure-automl-forecasting/src/evaluation.py
+++ b/experiments/azure-automl-forecasting/src/evaluation.py
+import logging
+from pathlib import Path
+from typing import List
+from unicodedata import numeric
+
+import fire
+import pandas as pd
+from rich.console import Console
+from rich.table import Table
+
+from src.utils.data_handler import ExperimentDataset, ForecastDataset
+
+logging.basicConfig(level=logging.INFO)
+main_logger = logging.getLogger(__name__)
+
+
+def print_df_rich(df: pd.DataFrame):
+    """Print ``df`` to the console as a rich table.
+
+    Float columns are rendered with three decimals and every other value via
+    ``str``. The input frame is left untouched.
+    """
+    console = Console()
+    table = Table()
+    float_cols = set(df.select_dtypes(include=["float"]).columns)
+    for col in df.columns:
+        table.add_column(str(col))
+    for row in df.itertuples(index=False):
+        # rich only accepts renderables (e.g. strings) as cells, so format here
+        # rather than overwriting the caller's columns.
+        cells = [
+            f"{value:.3f}" if col in float_cols else str(value)
+            for col, value in zip(df.columns, row)
+        ]
+        table.add_row(*cells)
+    console.print(table)
+
+
+# Maps each method key (also used as its results sub-directory name) to the
+# model name passed to the evaluation.
+METHODS = {
+    "azure_automl": "automl_prediction",
+    "nixtla_timegpt": "TimeGPT",
+    "statsforecast_sn": "SeasonalNaive",
+}
+
+
+def get_model_name(method: str) -> str:
+    """Return the model name registered for ``method`` in ``METHODS``.
+
+    Raises:
+        ValueError: if ``method`` is not a key of ``METHODS``.
+    """
+    if method in METHODS:
+        return METHODS[method]
+    raise ValueError(f"Invalid method: {method}")
+
+
+def evaluate_experiments(
+    datasets_paths: str,
+    methods_to_evaluate: List[str] = list(METHODS.keys()),
+    results_dir: str = "./results",
+):
+    """Evaluate the saved forecasts of every method on every dataset.
+
+    Args:
+        datasets_paths: comma-separated paths of the dataset parquet files.
+        methods_to_evaluate: keys of ``METHODS`` to include.
+        results_dir: directory holding ``<method>/<experiment_name>`` forecasts.
+
+    The raw metrics are written to ``<results_dir>/eval_datasets.csv``. The
+    printed summary scales every non-time metric by SeasonalNaive when that
+    model is available. Methods without ready forecasts are skipped.
+    """
+    datasets_paths_ = datasets_paths.split(",")
+    eval_datasets_df: pd.DataFrame | None = None
+    for dataset_path in datasets_paths_:
+        experiment_name = dataset_path.split("/")[-1].split(".")[0]
+        eval_method_df: pd.DataFrame | None = None
+        dataset: None | ExperimentDataset = None
+        for method in methods_to_evaluate:
+            results_experiment_dir = Path(results_dir) / method / experiment_name
+            if ForecastDataset.is_forecast_ready(results_experiment_dir):
+                main_logger.info(
+                    f"Evaluating experiment {experiment_name} and method {method}"
+                )
+                forecast_dataset = ForecastDataset.from_dir(results_experiment_dir)
+                # load the dataset lazily, only once a forecast is available
+                if dataset is None:
+                    dataset = ExperimentDataset.from_parquet(parquet_path=dataset_path)
+                eval_df = dataset.evaluate_forecast_df(
+                    forecast_df=forecast_dataset.forecast_df,
+                    model=get_model_name(method),
+                    total_time=forecast_dataset.total_time,
+                )
+                if eval_method_df is None:
+                    eval_method_df = eval_df
+                else:
+                    eval_method_df = pd.concat(
+                        [eval_method_df, eval_df],
+                        axis=1,
+                    )  # type: ignore
+            else:
+                main_logger.info(
+                    f"Skipping evaluation for experiment {experiment_name} and method {method}"
+                    " because the forecasts are not ready yet"
+                )
+        if eval_method_df is not None:
+            eval_method_df.reset_index(inplace=True)
+            eval_method_df.insert(0, "dataset", experiment_name)
+            if eval_datasets_df is None:
+                eval_datasets_df = eval_method_df
+            else:
+                eval_datasets_df = pd.concat(
+                    [eval_datasets_df, eval_method_df],
+                    ignore_index=True,
+                )  # type: ignore
+    if eval_datasets_df is not None:
+        azure_renamer = {"automl_prediction": "AzureAutoML"}
+        if "azure_automl" in methods_to_evaluate:
+            eval_datasets_df = eval_datasets_df.rename(columns=azure_renamer)
+        eval_datasets_df.to_csv(Path(results_dir) / "eval_datasets.csv", index=False)
+        eval_datasets_df["metric"] = (
+            eval_datasets_df["metric"].str.upper().str.replace("TOTAL_", "")
+        )
+        # scale by SeasonalNaive
+        if "SeasonalNaive" in eval_datasets_df.columns:
+            time_mask = eval_datasets_df["metric"] == "TIME"
+            for model in eval_datasets_df.columns.drop(["dataset", "metric"]):
+                if model == "SeasonalNaive":
+                    continue
+                eval_datasets_df.loc[~time_mask, model] = (
+                    eval_datasets_df.loc[~time_mask, model]
+                    / eval_datasets_df.loc[~time_mask, "SeasonalNaive"]
+                )
+            eval_datasets_df = eval_datasets_df.drop(columns=["SeasonalNaive"])
+
+        def pivot_df(df: pd.DataFrame, col: str) -> pd.DataFrame:
+            return df.pivot(
+                index="dataset",
+                columns="metric",
+                values=col,
+            )
+
+        result_list = []
+        models = []
+        for method in methods_to_evaluate:
+            if method == "statsforecast_sn":
+                continue
+            if method == "azure_automl":
+                col = "AzureAutoML"
+            else:
+                col = get_model_name(method)
+            if col not in eval_datasets_df.columns:
+                # The method was skipped above for every dataset, so it has no
+                # column to pivot.
+                main_logger.info(
+                    f"Skipping {col} in the summary because it has no evaluation results"
+                )
+                continue
+            pivotted_df = pivot_df(eval_datasets_df, col)
+            result_list.append(pivotted_df)
+            models.append(col)
+        if not result_list:
+            # pd.concat raises on an empty list
+            main_logger.info("No model results available to summarize")
+            return
+        result = pd.concat(result_list, axis=1, keys=models)
+        result = result.swaplevel(axis=1).sort_index(axis=1)
+        flattened_columns = ["_".join(col) for col in result.columns.values]
+        result.columns = flattened_columns
+        result = result.reset_index()
+        print_df_rich(result)
+
+
+if __name__ == "__main__":
+    fire.Fire(evaluate_experiments)
--- a/experiments/azure-automl-forecasting/src/nixtla_timegpt.py
+++ b/experiments/azure-automl-forecasting/src/nixtla_timegpt.py
+import sys
+from pathlib import Path
+from time import time
+
+import fire
+from dotenv import load_dotenv
+from nixtla import NixtlaClient
+
+from src.utils.data_handler import ExperimentDataset, ForecastDataset
+
+load_dotenv()
+
+
+def timegpt_forecast(dataset_path: str, results_dir: str = "./results"):
+    """Forecast the dataset with TimeGPT and save the forecasts and elapsed time
+    under ``<results_dir>/nixtla_timegpt/<experiment_name>``.
+    """
+    dataset = ExperimentDataset.from_parquet(parquet_path=dataset_path)
+    # One partition per started 20 MB of training data.
+    bytes_per_mb = 1024 * 1024
+    size_df = sys.getsizeof(dataset.Y_df_train) / bytes_per_mb
+    max_partition_size_mb = 20
+    num_partitions = int(size_df / max_partition_size_mb) + 1
+    timegpt = NixtlaClient(max_retries=1)
+    # Only the forecast call itself is timed.
+    started_at = time()
+    forecast_df = timegpt.forecast(
+        df=dataset.Y_df_train,
+        h=dataset.horizon,
+        freq=dataset.pandas_frequency,
+        model="timegpt-1-long-horizon",
+        num_partitions=num_partitions,
+    )
+    total_time = time() - started_at
+    forecast_dataset = ForecastDataset(
+        forecast_df=forecast_df,
+        total_time=total_time,
+    )
+    file_name = dataset_path.split("/")[-1]
+    experiment_name = file_name.split(".")[0]
+    forecast_dataset.save_to_dir(
+        Path(results_dir) / "nixtla_timegpt" / experiment_name
+    )
+
+
+if __name__ == "__main__":
+    fire.Fire(timegpt_forecast)
--- a/experiments/azure-automl-forecasting/src/statsforecast_sn.py
+++ b/experiments/azure-automl-forecasting/src/statsforecast_sn.py
+import os
+from pathlib import Path
+from time import time
+
+import fire
+from statsforecast import StatsForecast
+from statsforecast.models import SeasonalNaive
+
+from src.utils.data_handler import ExperimentDataset, ForecastDataset
+
+
+def sn_forecast(dataset_path: str, results_dir: str = "./results"):
+    """Forecast the dataset with SeasonalNaive and save the forecasts and elapsed
+    time under ``<results_dir>/statsforecast_sn/<experiment_name>``.
+    """
+    os.environ["NIXTLA_ID_AS_COL"] = "true"
+    dataset = ExperimentDataset.from_parquet(parquet_path=dataset_path)
+    seasonal_naive = SeasonalNaive(season_length=dataset.seasonality)
+    sf = StatsForecast(
+        models=[seasonal_naive],
+        freq=dataset.pandas_frequency,
+    )
+    # Only the forecast call itself is timed.
+    started_at = time()
+    forecast_df = sf.forecast(
+        df=dataset.Y_df_train,
+        h=dataset.horizon,
+    )
+    total_time = time() - started_at
+    forecast_dataset = ForecastDataset(forecast_df=forecast_df, total_time=total_time)
+    file_name = dataset_path.split("/")[-1]
+    experiment_name = file_name.split(".")[0]
+    forecast_dataset.save_to_dir(
+        Path(results_dir) / "statsforecast_sn" / experiment_name
+    )
+
+
+if __name__ == "__main__":
+    fire.Fire(sn_forecast)