# Forecasting at Scale: One Billion (1e9) Time Series with TimeGPT ⚡📈
Imagine you're tasked with forecasting for **one billion unique time series**—ranging from retail sales across thousands of stores to sensor data from millions of IoT devices. It's a monumental challenge, requiring not just statistical modeling but also cutting-edge tools to handle the scale and complexity of the data.
This project is a blueprint for scaling such a task, utilizing **Nixtla's foundation models for time series forecasting** and orchestrating the process efficiently using Python and AWS S3. Here's how you can tackle this kind of project.
## The Challenge 🎯
The goal is simple: forecast the future for **one billion different time series**, but the constraints are anything but simple. How do you handle the storage of this data? 🗄️ How do you parallelize the computation efficiently? 💻 And finally, how do you produce results quickly enough to be useful in decision-making? ⏳
### Enter Foundation Models for Time Series 🚀
**Nixtla** offers **TimeGPT** through an API that leverages foundation models capable of handling large-scale forecasting problems. These models are designed for flexibility and speed 🏎️, making them ideal for scenarios where you're dealing with an enormous volume of data and need results at a high cadence. ⚡
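To ground this, here is a minimal sketch of a single TimeGPT call through Nixtla's Python SDK (the toy DataFrame is ours; parameter names follow the public `nixtla` client):

```python
# A minimal sketch of calling TimeGPT through Nixtla's Python SDK.
# Assumes `pip install nixtla pandas` and a valid NIXTLA_API_KEY.
import os

import pandas as pd
from nixtla import NixtlaClient

client = NixtlaClient(api_key=os.environ["NIXTLA_API_KEY"])

# Toy long-format frame: one row per (series, timestamp) observation.
df = pd.DataFrame({
    "unique_id": ["series_1"] * 30,
    "ds": pd.date_range("2024-01-01", periods=30, freq="D"),
    "y": range(30),
})

# Forecast the next 7 days for every series in the frame.
fcst = client.forecast(df=df, h=7, freq="D",
                       time_col="ds", target_col="y", id_col="unique_id")
print(fcst.head())
```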
## Results 📊
| 📈 **Number of Series** | **Number of Processes** | ⏳ **CPU Time (hours)** |
| --- | --- | --- |

### Configuration ⚙️
The run is controlled by the following parameters (a configuration sketch follows this list):
- **`bucket`**: The S3 bucket where the data is stored.
- **`prefix`**: The path inside the S3 bucket where the input and output data are stored.
- **`n_partitions`**: The number of partitions to break the task into.
- **`series_per_partition`**: The number of time series in each partition.
- **`n_jobs`**: The number of processes to run in parallel.
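To make these concrete, a hypothetical configuration might look like the following (the bucket name, prefix, and `partition_key` helper are illustrative, not the project's actual code):

```python
# Hypothetical configuration mirroring the parameters above.
config = {
    "bucket": "my-forecast-bucket",     # illustrative bucket name
    "prefix": "timegpt-1e9",            # illustrative prefix
    "n_partitions": 10_000,
    "series_per_partition": 100_000,    # 10_000 * 100_000 = 1e9 series
    "n_jobs": 32,
}

def partition_key(config: dict, partition_id: int) -> str:
    """Illustrative S3 key for one partition's forecast output."""
    return f"{config['prefix']}/forecasts/partition={partition_id}.parquet"
```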
### What Happens Behind the Scenes 🔍
The code will (a simplified worker sketch follows this list):
1. Check if the forecast for each partition has already been generated. ✅
2. Generate new time series data for each partition. 🧬
3. Use Nixtla’s API to compute forecasts for each partition. 🔮
4. Save the results and the time taken to S3. 💾
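A per-partition worker following these four steps might look like this (a hedged sketch: it reuses the hypothetical `config` and `partition_key` from above, simplifies step 4 to saving forecasts only, and assumes `boto3`, `s3fs`, and the `nixtla` SDK are installed):

```python
# Simplified per-partition worker, following the four steps above.
import boto3
import numpy as np
import pandas as pd
from botocore.exceptions import ClientError
from nixtla import NixtlaClient

def process_partition(config: dict, partition_id: int, client: NixtlaClient) -> None:
    s3 = boto3.client("s3")
    key = partition_key(config, partition_id)

    # 1. Skip partitions whose forecast already exists (makes reruns idempotent).
    try:
        s3.head_object(Bucket=config["bucket"], Key=key)
        return
    except ClientError:
        pass  # not found yet, so compute it

    # 2. Generate synthetic series for this partition (stand-in for real data).
    n, length, horizon = config["series_per_partition"], 60, 7
    rng = np.random.default_rng(partition_id)
    df = pd.DataFrame({
        "unique_id": np.repeat(np.arange(n), length),
        "ds": np.tile(pd.date_range("2024-01-01", periods=length, freq="D"), n),
        "y": rng.random(n * length),
    })

    # 3. Compute forecasts for the partition with TimeGPT.
    fcst = client.forecast(df=df, h=horizon, freq="D")

    # 4. Save the results to S3 (pandas writes s3:// paths through s3fs).
    fcst.to_parquet(f"s3://{config['bucket']}/{key}")
```

Partition ids can then be fanned out across `n_jobs` worker processes, e.g. with `multiprocessing.Pool`.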
## Scaling to Billions 🚀
This approach is designed to **scale**—whether you’re forecasting for **one million** or **one billion** series. By partitioning the data, processing it in parallel 🧠, and leveraging foundation models like those provided by Nixtla, you can handle even the most massive forecasting tasks efficiently. ⚙️
### Final Thoughts 💡
Forecasting at scale is no easy feat, but with the right tools, it’s entirely achievable. This project demonstrates how modern time series forecasting techniques can be applied to massive datasets in an efficient, scalable way. By leveraging AWS infrastructure, foundation models, and clever parallel processing, you can forecast the future for billions of unique data series—**unlocking insights** that can power decision-making at an unprecedented scale. 🌍✨
# TimeGPT vs Prophet: Time Series Forecasting Benchmark
## Overview
This repository offers a detailed benchmarking framework for comparing the performance of TimeGPT against Prophet and StatsForecast in time series forecasting. We provide datasets with over 300,000 series across various frequencies, including daily, weekly, 10-minute, and hourly intervals. Users can also incorporate their own datasets for a more personalized analysis. **TimeGPT was not trained on these datasets.**
## Results
The results show that, on average, TimeGPT zero-shot forecasts are **58% more accurate and 92% faster than Prophet's**. The improvements are consistent across 8 frequencies, ranging from minute to quarterly data.
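As a rough illustration of how such relative numbers are typically derived (this helper is our assumption, not necessarily the exact aggregation in `/src`):

```python
# Illustrative computation of "X% more accurate / Y% faster" style figures.
def relative_improvement(baseline: float, challenger: float) -> float:
    """Fraction by which `challenger` improves on `baseline` (lower is better)."""
    return 1 - challenger / baseline

# e.g., if Prophet's mean error were 1.0 and TimeGPT's 0.42:
print(f"{relative_improvement(1.0, 0.42):.0%} more accurate")  # 58% more accurate
# and if Prophet took 100s while TimeGPT took 8s:
print(f"{relative_improvement(100.0, 8.0):.0%} faster")        # 92% faster
```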
## Notes
- Results were generated using a VM with 96 cores and 196 GB of RAM.
- Prophet and StatsForecast were executed in parallel.
- TimeGPT uses the AzureML endpoint.
- Since the AzureML endpoint does not support GPU inference or scalable requests, these results could improve further.
## Repository Structure
- `/data`: Parquet files with time series data.
- `/src`: Source code for running benchmarks and experiments.
- `/data/results`: Outputs and analysis from benchmark runs.
## Data Structure
Datasets should adhere to this structure (a toy example follows the list):
- **unique_id**: Identifier for each series.
- **ds**: Timestamp of the observation.
- **y**: Target variable for forecasting.
- **frequency**: Description of the data frequency (e.g., 'Daily').
- **pandas_frequency**: Pandas frequency string (e.g., 'D').
- **h**: Forecasting horizon. (The last `h` periods of each series will be used as the test set.)
- **seasonality**: Seasonality of the series (e.g., 7 for daily data).
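For example, a tiny compliant Parquet file could be produced like this (all values are illustrative):

```python
# A minimal Parquet file matching the expected schema (toy values).
import pandas as pd

df = pd.DataFrame({
    "unique_id":        ["store_1"] * 3,
    "ds":               pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
    "y":                [12.0, 15.5, 14.2],
    "frequency":        ["Daily"] * 3,
    "pandas_frequency": ["D"] * 3,
    "h":                [7] * 3,
    "seasonality":      [7] * 3,
})
df.to_parquet("data/my_dataset.parquet")
```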
## Running Experiments
### Makefile
The repository includes a Makefile to streamline the process of running experiments. The key commands are:
1. **evaluate**: Runs the evaluation for a specified method (`timegpt`, `prophet`, or `statsforecast`).
2. **summarize_results**: Summarizes the results from the evaluation.
### Evaluation Flow
1. **Run Evaluation**: Use `make evaluate method=<method_name>`, where `<method_name>` is one of `timegpt`, `prophet`, or `statsforecast`. The script filters out files containing specific strings (like 'catalogue') and runs the experiment for each `.parquet` file in the `/data` directory. The results are written to `/data/results`.
2. **Summarize Results**: After running evaluations for each method, execute `make summarize_results` to aggregate and summarize the results, which are written to this `README.md` file (a rough aggregation sketch follows this list).
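For intuition only, an aggregation step could look like the following; the actual summarization logic lives behind `make summarize_results`, and the file layout and column names here are our assumption:

```python
# Hypothetical aggregation sketch; the repo's real logic may differ.
from pathlib import Path

import pandas as pd

# Assume one parquet per method/dataset run, each with metric columns.
frames = [pd.read_parquet(p) for p in Path("data/results").glob("*.parquet")]
results = pd.concat(frames)

# Average each (assumed) metric per method.
print(results.groupby("method")[["error", "time"]].mean())
```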
## Getting Started
1. **Prepare Data**: Ensure your Parquet files are in `/data`. If you want access to the original datasets, please write to `support@nixtla.io` with your use case.
2. **Create the conda environment**: Run `conda env create -f environment.yml` and activate it with `conda activate timegpt-benchmark`.
3. **Run Benchmarks**: Use the Makefile commands to run evaluations and summarize results.
# Salesforce's Moirai performs well on hourly data and is much faster than Chronos, but it is still up to 33% less accurate and less efficient than statistical models on monthly, weekly, and yearly data
We present a comprehensive, reproducible evaluation demonstrating that a Statistical Ensemble (comprising AutoARIMA, AutoETS, AutoCES, and DynamicOptimizedTheta) substantially surpasses [Salesforce Moirai](https://github.com/SalesforceAIResearch/uni2ts), a foundational model for time series forecasting with over 311 million parameters. The **Statistical Ensemble achieves 33%, 33%, and 15% better performance in CRPS, MASE, and SMAPE, respectively**, across the M1, M3, M4, and Tourism benchmark datasets. Even a **simple Seasonal Naive achieves 17% and 0.5% better performance in MASE and SMAPE, respectively; however, Moirai is 25% more accurate than Seasonal Naive in terms of CRPS**.
These datasets cover more than **100,000 unique time series**, offering a robust comparison of the models. Efficiency-wise, **Moirai is 3.5x faster than the Statistical Ensemble but 160x slower than a seasonal naive forecast**, marking a trade-off between speed and accuracy in different forecasting frequencies.
# Introduction
Following our recent [benchmark demonstrating Amazon Chronos's lesser accuracy and slower speed compared to classical statistical models](https://github.com/Nixtla/nixtla/tree/main/experiments/amazon-chronos), the community sought a similar analysis for Moirai. We commend the Salesforce AI team for releasing the first fully open-source foundational time series model, complete with weights, data, and code. Moirai's accuracy shines on hourly data, a noteworthy achievement we're eager to highlight. Our acknowledgment extends to Salesforce for recognizing our prior contributions to this research field.
Foundational models like Salesforce's Moirai signify a notable advance in time series forecasting, leveraging deep learning and extensive datasets for pre-training to enhance predictions. Despite Moirai's impressive parameter count (311 million) and scope, our findings suggest that traditional forecasting methods grouped into a Statistical Ensemble often outperform in accuracy. This benchmark continues our exploration of statistical versus deep learning models in forecasting.
In our assessment, Salesforce's Moirai shows a more promising path than Amazon Chronos in handling hourly data, hinting at the potential to eventually surpass classical statistical methods.
## Empirical Evaluation
Expanding upon our prior work, this study evaluates over 100,000 unique time series from the M1, M3, M4, and Tourism datasets across various frequencies. Our analysis also benchmarks against the Seasonal Naive model, a staple in traditional forecasting methods.
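Since Seasonal Naive anchors the comparison, here is a minimal sketch of that baseline (the standard definition, not necessarily the benchmark's exact implementation):

```python
# Seasonal Naive: repeat the last observed seasonal cycle over the horizon.
import numpy as np

def seasonal_naive(y: np.ndarray, season_length: int, h: int) -> np.ndarray:
    """Forecast h steps by cycling the last `season_length` observations."""
    last_cycle = y[-season_length:]
    return np.resize(last_cycle, h)

y = np.array([10, 12, 14, 11, 13, 15, 12, 14])  # toy series
print(seasonal_naive(y, season_length=4, h=6))   # -> [13 15 12 14 13 15]
```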
## Results
The **Statistical Ensemble achieves 33%, 33%, and 15% better performance in CRPS, MASE, and SMAPE, respectively**, across the M1, M3, M4, and Tourism benchmark datasets. A **simple Seasonal Naive achieves 17% and 0.5% better performance in MASE and SMAPE, respectively; however, Moirai is 25% more accurate than Seasonal Naive in terms of CRPS**.
Efficiency-wise, **Moirai is 3.5x faster than the Statistical Ensemble but 160x slower than a seasonal naive forecast**, marking a trade-off between speed and accuracy in different forecasting frequencies.
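For reference, the headline metrics can be sketched with their textbook definitions (the benchmark's exact implementations may differ; CRPS is approximated by averaging pinball losses over a quantile grid, a common approximation in probabilistic forecasting toolkits):

```python
import numpy as np

def smape(y, yhat):
    """Symmetric MAPE: mean of 2|y - yhat| / (|y| + |yhat|)."""
    return np.mean(2 * np.abs(y - yhat) / (np.abs(y) + np.abs(yhat)))

def mase(y, yhat, y_train, season_length):
    """MAE scaled by the in-sample seasonal-naive MAE."""
    scale = np.mean(np.abs(y_train[season_length:] - y_train[:-season_length]))
    return np.mean(np.abs(y - yhat)) / scale

def crps_from_quantiles(y, q_forecasts, quantiles):
    """Approximate CRPS as the average pinball (quantile) loss over a grid."""
    losses = []
    for q, yhat_q in zip(quantiles, q_forecasts):
        diff = y - yhat_q
        losses.append(np.mean(np.maximum(q * diff, (q - 1) * diff)))
    return 2 * np.mean(losses)  # the factor-of-2 convention varies by library
```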
It is critical to highlight that Moirai may possess an unfair advantage over the statistical ensemble due to its training methodology. Specifically, Moirai was trained on the very datasets that are being used here for evaluation, whereas the statistical ensemble was not exposed to the test data during fitting.
The complete code to replicate all results is available at [GitHub](https://github.com/Nixtla/nixtla/tree/main/experiments/salesforce-moirai). This study underscores statistical models' continued relevance and superiority in specific scenarios, challenging the assumption that foundational deep-learning models are always the best solution for time series forecasting.
## Reproducibility
To ensure the reproducibility of our findings, the Statistical Ensemble experiments were conducted on an AWS c5a.24xlarge instance, equipped with 96 vCPUs and 192 GiB of RAM. In contrast, the experiments for Salesforce Moirai were carried out on an AWS g5.4xlarge GPU instance, which includes 16 vCPUs, 64 GiB of RAM, and an NVIDIA A10G Tensor Core GPU with 24 GiB. All necessary code and detailed instructions for reproducing the experiments are available in this directory.