import pytest
try:
import dask.dataframe as dd
from dask.distributed import Client
@pytest.fixture(scope="module")
def dask_client():
with Client() as client:
yield client
@pytest.fixture(scope="module")
def dask_df(distributed_series):
return dd.from_pandas(distributed_series, npartitions=2)
@pytest.fixture(scope="module")
def dask_diff_cols_df(distributed_series, renamer):
return dd.from_pandas(
distributed_series.rename(columns=renamer),
npartitions=2,
)
@pytest.fixture(scope="module")
def dask_df_x(distributed_df_x):
return dd.from_pandas(distributed_df_x, npartitions=2)
@pytest.fixture(scope="module")
def dask_future_ex_vars_df(distributed_future_ex_vars_df):
return dd.from_pandas(distributed_future_ex_vars_df, npartitions=2)
@pytest.fixture(scope="module")
def dask_df_x_diff_cols(distributed_df_x, renamer):
return dd.from_pandas(distributed_df_x.rename(columns=renamer), npartitions=2)
@pytest.fixture(scope="module")
def dask_future_ex_vars_df_diff_cols(distributed_future_ex_vars_df, renamer):
return dd.from_pandas(
distributed_future_ex_vars_df.rename(columns=renamer), npartitions=2
)
except ImportError:
# If Dask is not installed, we skip the fixtures
pytest.skip(
"Dask is not installed, skipping Dask fixtures", allow_module_level=True
)
import pytest
try:
import ray
from ray.cluster_utils import Cluster
@pytest.fixture(scope="module")
def ray_cluster_setup():
ray_cluster = Cluster(initialize_head=True, head_node_args={"num_cpus": 2})
with ray.init(address=ray_cluster.address, ignore_reinit_error=True):
# add mock node to simulate a cluster
ray_cluster.add_node(num_cpus=2)
yield
@pytest.fixture(scope="module")
def ray_df(distributed_series):
return ray.data.from_pandas(distributed_series)
@pytest.fixture(scope="module")
def ray_diff_cols_df(distributed_series, renamer):
return ray.data.from_pandas(distributed_series.rename(columns=renamer))
@pytest.fixture(scope="module")
def ray_df_x(distributed_df_x):
return ray.data.from_pandas(distributed_df_x)
@pytest.fixture(scope="module")
def ray_future_ex_vars_df(distributed_future_ex_vars_df):
return ray.data.from_pandas(distributed_future_ex_vars_df)
@pytest.fixture(scope="module")
def ray_df_x_diff_cols(distributed_df_x, renamer):
return ray.data.from_pandas(distributed_df_x.rename(columns=renamer))
@pytest.fixture(scope="module")
def ray_future_ex_vars_df_diff_cols(distributed_future_ex_vars_df, renamer):
return ray.data.from_pandas(
distributed_future_ex_vars_df.rename(columns=renamer)
)
except ImportError:
# If Ray is not installed, we skip the fixtures
pytest.skip("Ray is not installed, skipping Ray fixtures", allow_module_level=True)
import pytest
try:
from pyspark.sql import SparkSession
@pytest.fixture(scope="module")
def spark_client():
with SparkSession.builder.getOrCreate() as spark:
yield spark
@pytest.fixture(scope="module")
def spark_df(spark_client, distributed_series):
spark_df = spark_client.createDataFrame(distributed_series).repartition(2)
return spark_df
@pytest.fixture(scope="module")
def spark_diff_cols_df(spark_client, distributed_series, renamer):
spark_df = spark_client.createDataFrame(
distributed_series.rename(columns=renamer)
).repartition(2)
return spark_df
@pytest.fixture(scope="module")
def spark_df_x(spark_client, distributed_df_x):
spark_df = spark_client.createDataFrame(distributed_df_x).repartition(2)
return spark_df
@pytest.fixture(scope="module")
def spark_df_x_diff_cols(spark_client, distributed_df_x, renamer):
spark_df = spark_client.createDataFrame(
distributed_df_x.rename(columns=renamer)
).repartition(2)
return spark_df
@pytest.fixture(scope="module")
def spark_future_ex_vars_df(spark_client, distributed_future_ex_vars_df):
spark_df = spark_client.createDataFrame(
distributed_future_ex_vars_df
).repartition(2)
return spark_df
@pytest.fixture(scope="module")
def spark_future_ex_vars_df_diff_cols(
spark_client, distributed_future_ex_vars_df, renamer
):
spark_df = spark_client.createDataFrame(
distributed_future_ex_vars_df.rename(columns=renamer)
).repartition(2)
return spark_df
except ImportError:
# If PySpark is not installed, we skip the fixtures
pytest.skip(
"PySpark is not installed, skipping Spark fixtures", allow_module_level=True
)
import fugue
import fugue.api as fa
import numpy as np
import pandas as pd
import pytest
import time
from nixtla.nixtla_client import NixtlaClient
from typing import Callable
# setting used for distributed related tests
ATOL = 1e-3
# num_partitions tests
# we need to be sure that partitioned runs recover the same results
# as a single-partition run
# note: num_partitions can produce different results when finetune_steps is used
def check_num_partitions_same_results(method: Callable, num_partitions: int, **kwargs):
res_partitioned = method(**kwargs, num_partitions=num_partitions)
res_no_partitioned = method(**kwargs, num_partitions=1)
sort_by = ["unique_id", "ds"]
if "cutoff" in res_partitioned:
sort_by.extend(["cutoff"])
pd.testing.assert_frame_equal(
res_partitioned.sort_values(sort_by).reset_index(drop=True),
res_no_partitioned.sort_values(sort_by).reset_index(drop=True),
rtol=1e-2,
atol=1e-2,
)
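# A minimal usage sketch (not collected by pytest) of the helper above, assuming a
# configured NixtlaClient and a pandas frame with unique_id/ds/y columns; the real
# invocations live in the parametrized num_partitions tests further below.
def _example_num_partitions_check(nixtla_client: NixtlaClient, df: pd.DataFrame):
    # forecasting with two partitions should match the single-partition run
    check_num_partitions_same_results(
        method=nixtla_client.forecast,
        num_partitions=2,
        df=df,
        h=7,
    )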
def check_retry_behavior(
df,
side_effect,
side_effect_exception,
max_retries=5,
retry_interval=5,
max_wait_time=40,
should_retry=True,
sleep_seconds=5,
):
mock_nixtla_client = NixtlaClient(
max_retries=max_retries,
retry_interval=retry_interval,
max_wait_time=max_wait_time,
)
mock_nixtla_client._make_request = side_effect
init_time = time.time()
with pytest.raises(side_effect_exception):
mock_nixtla_client.forecast(
df=df, h=12, time_col="timestamp", target_col="value"
)
total_mock_time = time.time() - init_time
if should_retry:
approx_expected_time = min((max_retries - 1) * retry_interval, max_wait_time)
upper_expected_time = min(max_retries * retry_interval, max_wait_time)
assert total_mock_time >= approx_expected_time, "It is not retrying as expected"
        # overhead on top of the expected retry time (preprocessing before the first
        # api call plus the per-attempt sleeps) should stay within one extra sleep interval
assert (
total_mock_time - upper_expected_time - (max_retries - 1) * sleep_seconds
<= sleep_seconds
)
else:
assert total_mock_time <= max_wait_time
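# A minimal usage sketch (not collected by pytest) of check_retry_behavior: the
# side-effect callable replaces NixtlaClient._make_request, so every forecast
# attempt fails and the retry loop is exercised. The real side effects live in the
# retry tests further below; the 503 body used here is an assumed placeholder.
def _example_retry_check(df: pd.DataFrame):
    from nixtla.nixtla_client import ApiError

    def _always_fail(*args, **kwargs):
        raise ApiError(status_code=503, body="Server Error")

    check_retry_behavior(
        df=df,
        side_effect=_always_fail,
        side_effect_exception=ApiError,
    )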
# test that we recover the same mean forecasts
# with and without add_history
# (keeping only the last h rows per series when history is requested)
def check_equal_fcsts_add_history(nixtla_client, **kwargs):
fcst_no_rest_df = nixtla_client.forecast(**kwargs, add_history=True)
fcst_no_rest_df = (
fcst_no_rest_df.groupby("unique_id", observed=True)
.tail(kwargs["h"])
.reset_index(drop=True)
)
fcst_rest_df = nixtla_client.forecast(**kwargs)
pd.testing.assert_frame_equal(
fcst_no_rest_df,
fcst_rest_df,
atol=1e-4,
rtol=1e-3,
)
return fcst_rest_df
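# A minimal usage sketch (not collected by pytest) of the add_history check above,
# assuming a configured NixtlaClient and a pandas frame `df`; it returns the plain
# forecast so callers can run further comparisons (see the model tests below).
def _example_add_history_check(nixtla_client: NixtlaClient, df: pd.DataFrame):
    return check_equal_fcsts_add_history(nixtla_client, df=df, h=12)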
def check_quantiles(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "id_col",
time_col: str = "time_col",
):
test_qls = list(np.arange(0.1, 1, 0.1))
exp_q_cols = [f"TimeGPT-q-{int(q * 100)}" for q in test_qls]
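    # e.g. q=0.1 -> "TimeGPT-q-10", q=0.5 -> "TimeGPT-q-50", q=0.9 -> "TimeGPT-q-90"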
def test_method_qls(method, **kwargs):
df_qls = method(
df=df, h=12, id_col=id_col, time_col=time_col, quantiles=test_qls, **kwargs
)
df_qls = fa.as_pandas(df_qls)
assert all(col in df_qls.columns for col in exp_q_cols)
# test monotonicity of quantiles
        assert (
            df_qls[exp_q_cols]
            .apply(lambda x: x.is_monotonic_increasing, axis=1)
            .all()
        )
test_method_qls(nixtla_client.forecast)
test_method_qls(nixtla_client.forecast, add_history=True)
test_method_qls(nixtla_client.cross_validation)
def check_cv_same_results_num_partitions(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
horizon: int = 12,
id_col: str = "unique_id",
time_col: str = "ds",
**fcst_kwargs,
):
fcst_df = nixtla_client.cross_validation(
df=df,
h=horizon,
num_partitions=1,
id_col=id_col,
time_col=time_col,
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
fcst_df_2 = nixtla_client.cross_validation(
df=df,
h=horizon,
num_partitions=2,
id_col=id_col,
time_col=time_col,
**fcst_kwargs,
)
fcst_df_2 = fa.as_pandas(fcst_df_2)
pd.testing.assert_frame_equal(
fcst_df.sort_values([id_col, time_col]).reset_index(drop=True),
fcst_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
atol=ATOL,
)
def check_forecast_diff_results_diff_models(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
horizon: int = 12,
id_col: str = "unique_id",
time_col: str = "ds",
**fcst_kwargs,
):
fcst_df = nixtla_client.forecast(
df=df,
h=horizon,
num_partitions=1,
id_col=id_col,
time_col=time_col,
model="timegpt-1",
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
fcst_df_2 = nixtla_client.forecast(
df=df,
h=horizon,
num_partitions=1,
id_col=id_col,
time_col=time_col,
model="timegpt-1-long-horizon",
**fcst_kwargs,
)
fcst_df_2 = fa.as_pandas(fcst_df_2)
with pytest.raises(
AssertionError, match=r'\(column name="TimeGPT"\) are different'
):
pd.testing.assert_frame_equal(
fcst_df.sort_values([id_col, time_col]).reset_index(drop=True),
fcst_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
)
def check_forecast(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
horizon: int = 12,
id_col: str = "unique_id",
time_col: str = "ds",
n_series_to_check: int = 4,
**fcst_kwargs,
):
fcst_df = nixtla_client.forecast(
df=df,
h=horizon,
id_col=id_col,
time_col=time_col,
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
    assert n_series_to_check * horizon == len(fcst_df)
cols = fcst_df.columns.to_list()
exp_cols = [id_col, time_col, "TimeGPT"]
if "level" in fcst_kwargs:
level = sorted(fcst_kwargs["level"])
exp_cols.extend([f"TimeGPT-lo-{lv}" for lv in reversed(level)])
exp_cols.extend([f"TimeGPT-hi-{lv}" for lv in level])
assert cols == exp_cols
def check_forecast_same_results_num_partitions(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
horizon: int = 12,
id_col: str = "unique_id",
time_col: str = "ds",
**fcst_kwargs,
):
fcst_df = nixtla_client.forecast(
df=df,
h=horizon,
num_partitions=1,
id_col=id_col,
time_col=time_col,
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
fcst_df_2 = nixtla_client.forecast(
df=df,
h=horizon,
num_partitions=2,
id_col=id_col,
time_col=time_col,
**fcst_kwargs,
)
fcst_df_2 = fa.as_pandas(fcst_df_2)
pd.testing.assert_frame_equal(
fcst_df.sort_values([id_col, time_col]).reset_index(drop=True),
fcst_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
atol=ATOL,
)
def check_forecast_dataframe(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
n_series_to_check: int = 4,
):
check_cv_same_results_num_partitions(nixtla_client, df, n_windows=2, step_size=1)
check_cv_same_results_num_partitions(
nixtla_client, df, n_windows=3, step_size=None, horizon=1
)
check_cv_same_results_num_partitions(
nixtla_client, df, model="timegpt-1-long-horizon", horizon=1
)
check_forecast_diff_results_diff_models(nixtla_client, df)
check_forecast(nixtla_client, df, num_partitions=1)
check_forecast(
nixtla_client,
df,
level=[90, 80],
num_partitions=1,
n_series_to_check=n_series_to_check,
)
check_forecast_same_results_num_partitions(nixtla_client, df)
def check_forecast_dataframe_diff_cols(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "id_col",
time_col: str = "time_col",
target_col: str = "target_col",
):
check_forecast(
nixtla_client,
df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
num_partitions=1,
)
check_forecast(
nixtla_client,
df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
level=[90, 80],
num_partitions=1,
)
check_forecast_same_results_num_partitions(
nixtla_client, df, id_col=id_col, time_col=time_col, target_col=target_col
)
def check_anomalies(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
**anomalies_kwargs,
):
anomalies_df = nixtla_client.detect_anomalies(
df=df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**anomalies_kwargs,
)
anomalies_df = fa.as_pandas(anomalies_df)
assert (fa.as_pandas(df)[id_col].unique() == anomalies_df[id_col].unique()).all()
cols = anomalies_df.columns.to_list()
level = anomalies_kwargs.get("level", 99)
exp_cols = [
id_col,
time_col,
target_col,
"TimeGPT",
"anomaly",
f"TimeGPT-lo-{level}",
f"TimeGPT-hi-{level}",
]
assert cols == exp_cols
def check_anomalies_same_results_num_partitions(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
**anomalies_kwargs,
):
anomalies_df = nixtla_client.detect_anomalies(
df=df,
num_partitions=1,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**anomalies_kwargs,
)
anomalies_df = fa.as_pandas(anomalies_df)
anomalies_df_2 = nixtla_client.detect_anomalies(
df=df,
num_partitions=2,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**anomalies_kwargs,
)
anomalies_df_2 = fa.as_pandas(anomalies_df_2)
pd.testing.assert_frame_equal(
anomalies_df.sort_values([id_col, time_col]).reset_index(drop=True),
anomalies_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
atol=ATOL,
)
def check_anomalies_dataframe(nixtla_client: NixtlaClient, df: fugue.AnyDataFrame):
check_anomalies(nixtla_client, df, num_partitions=1)
check_anomalies(nixtla_client, df, level=90, num_partitions=1)
check_anomalies_same_results_num_partitions(nixtla_client, df)
def check_online_anomalies(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
level=99,
    **realtime_anomalies_kwargs,
):
anomalies_df = nixtla_client.detect_anomalies_online(
df=df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
        **realtime_anomalies_kwargs,
)
anomalies_df = fa.as_pandas(anomalies_df)
assert (fa.as_pandas(df)[id_col].unique() == anomalies_df[id_col].unique()).all()
cols = anomalies_df.columns.to_list()
exp_cols = [
id_col,
time_col,
target_col,
"TimeGPT",
"anomaly",
"anomaly_score",
f"TimeGPT-lo-{level}",
f"TimeGPT-hi-{level}",
]
assert cols == exp_cols
def check_anomalies_online_same_results_num_partitions(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
    **realtime_anomalies_kwargs,
):
anomalies_df = nixtla_client.detect_anomalies_online(
df=df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
num_partitions=1,
        **realtime_anomalies_kwargs,
)
anomalies_df = fa.as_pandas(anomalies_df)
anomalies_df_2 = nixtla_client.detect_anomalies_online(
df=df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
num_partitions=2,
        **realtime_anomalies_kwargs,
)
anomalies_df_2 = fa.as_pandas(anomalies_df_2)
pd.testing.assert_frame_equal(
anomalies_df.sort_values([id_col, time_col]).reset_index(drop=True),
anomalies_df_2.sort_values([id_col, time_col]).reset_index(drop=True),
atol=ATOL,
)
def check_anomalies_online_dataframe(
nixtla_client: NixtlaClient, df: fugue.AnyDataFrame
):
check_online_anomalies(
nixtla_client,
df,
h=20,
detection_size=5,
threshold_method="univariate",
level=99,
num_partitions=1,
)
check_anomalies_online_same_results_num_partitions(
nixtla_client,
df,
h=20,
detection_size=5,
threshold_method="univariate",
level=99,
)
def check_anomalies_dataframe_diff_cols(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
id_col: str = "id_col",
time_col: str = "time_col",
target_col: str = "target_col",
):
check_anomalies(
nixtla_client,
df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
num_partitions=1,
)
check_anomalies(
nixtla_client,
df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
level=90,
num_partitions=1,
)
check_anomalies_same_results_num_partitions(
nixtla_client, df, id_col=id_col, time_col=time_col, target_col=target_col
)
def check_forecast_x(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
X_df: fugue.AnyDataFrame,
horizon: int = 24,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
**fcst_kwargs,
):
fcst_df = nixtla_client.forecast(
df=df,
X_df=X_df,
h=horizon,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
n_series = fa.as_pandas(X_df)[id_col].nunique()
assert n_series * horizon == len(fcst_df)
cols = fcst_df.columns.to_list()
exp_cols = [id_col, time_col, "TimeGPT"]
if "level" in fcst_kwargs:
level = sorted(fcst_kwargs["level"])
exp_cols.extend([f"TimeGPT-lo-{lv}" for lv in reversed(level)])
exp_cols.extend([f"TimeGPT-hi-{lv}" for lv in level])
assert cols == exp_cols
fcst_df_2 = nixtla_client.forecast(
df=df,
h=horizon,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**fcst_kwargs,
)
fcst_df_2 = fa.as_pandas(fcst_df_2)
equal_arrays = np.array_equal(
fcst_df.sort_values([id_col, time_col])["TimeGPT"].values,
fcst_df_2.sort_values([id_col, time_col])["TimeGPT"].values,
)
assert not equal_arrays, "Forecasts with and without ex vars are equal"
def check_forecast_x_same_results_num_partitions(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
X_df: fugue.AnyDataFrame,
horizon: int = 24,
id_col: str = "unique_id",
time_col: str = "ds",
target_col: str = "y",
**fcst_kwargs,
):
fcst_df = nixtla_client.forecast(
df=df,
X_df=X_df,
h=horizon,
num_partitions=1,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**fcst_kwargs,
)
fcst_df = fa.as_pandas(fcst_df)
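    # note: the second call below omits X_df and uses two partitions; the final
    # assertion checks that these forecasts differ from the run with exogenous features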
fcst_df_2 = nixtla_client.forecast(
df=df,
h=horizon,
num_partitions=2,
id_col=id_col,
time_col=time_col,
target_col=target_col,
**fcst_kwargs,
)
fcst_df_2 = fa.as_pandas(fcst_df_2)
equal_arrays = np.array_equal(
fcst_df.sort_values([id_col, time_col])["TimeGPT"].values,
fcst_df_2.sort_values([id_col, time_col])["TimeGPT"].values,
)
assert not equal_arrays, "Forecasts with and without ex vars are equal"
def check_forecast_x_dataframe(
nixtla_client: NixtlaClient, df: fugue.AnyDataFrame, X_df: fugue.AnyDataFrame
):
check_forecast_x(nixtla_client, df, X_df, num_partitions=1)
check_forecast_x(nixtla_client, df, X_df, level=[90, 80], num_partitions=1)
check_forecast_x_same_results_num_partitions(nixtla_client, df, X_df)
def check_forecast_x_dataframe_diff_cols(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
X_df: fugue.AnyDataFrame,
id_col: str = "id_col",
time_col: str = "time_col",
target_col: str = "target_col",
):
check_forecast_x(
nixtla_client,
df,
X_df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
num_partitions=1,
)
check_forecast_x(
nixtla_client,
df,
X_df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
level=[90, 80],
num_partitions=1,
)
check_forecast_x_same_results_num_partitions(
nixtla_client, df, X_df, id_col=id_col, time_col=time_col, target_col=target_col
)
def check_finetuned_model(
nixtla_client: NixtlaClient,
df: fugue.AnyDataFrame,
model_id2: str,
):
# fine-tuning on distributed fails
with pytest.raises(
ValueError, match="Can only fine-tune on pandas or polars dataframes."
):
nixtla_client.finetune(df=df)
# forecast
local_fcst = nixtla_client.forecast(
df=fa.as_pandas(df), h=5, finetuned_model_id=model_id2,
)
distr_fcst = (
fa.as_pandas(nixtla_client.forecast(df=df, h=5, finetuned_model_id=model_id2))
.sort_values(["unique_id", "ds"])
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
local_fcst,
distr_fcst,
check_dtype=False,
atol=1e-4,
rtol=1e-2,
)
# cross-validation
local_cv = nixtla_client.cross_validation(
df=fa.as_pandas(df), n_windows=2, h=5, finetuned_model_id=model_id2
)
distr_cv = (
fa.as_pandas(
nixtla_client.cross_validation(
df=df, n_windows=2, h=5, finetuned_model_id=model_id2
)
)
.sort_values(["unique_id", "ds"])
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
local_cv,
distr_cv[local_cv.columns],
check_dtype=False,
atol=1e-4,
rtol=1e-2,
)
# anomaly detection
local_anomaly = nixtla_client.detect_anomalies(
df=fa.as_pandas(df), finetuned_model_id=model_id2
)
distr_anomaly = (
fa.as_pandas(
nixtla_client.detect_anomalies(df=df, finetuned_model_id=model_id2)
)
.sort_values(["unique_id", "ds"])
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
local_anomaly,
distr_anomaly[local_anomaly.columns],
check_dtype=False,
atol=1e-3,
rtol=1e-2,
)
import os
from contextlib import contextmanager
@contextmanager
def delete_env_var(key):
original_value = os.environ.get(key)
rm = False
if key in os.environ:
del os.environ[key]
rm = True
try:
yield
finally:
if rm:
os.environ[key] = original_value
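# A minimal usage sketch (not collected by pytest): temporarily drop an API key so
# that client construction has to fail, then restore it on exit; see
# test_api_key_fail for the real usage.
def _example_without_api_key():
    with delete_env_var("NIXTLA_API_KEY"):
        assert "NIXTLA_API_KEY" not in os.environ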
import uuid
class ModelIds:
model_id1 = str(uuid.uuid4())
model_id2 = None
model_ids_object = ModelIds()
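# model_ids_object is shared, mutable state across test modules: model_id1 is
# generated up front, while model_id2 is filled in later by the fine-tuning tests
# and then reused by the distributed fine-tuned-model checks.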
import pandas as pd
import pytest
def test_audit_data_all_pass(custom_client, df_ok, common_kwargs):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_ok, **common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
def test_audit_data_with_duplicates(
custom_client, df_with_duplicates_set2, common_kwargs
):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_duplicates_set2, **common_kwargs
)
assert not all_pass
assert len(case_specific_dfs) == 0
assert len(fail_dfs) == 2
assert "D001" in fail_dfs
# The two duplicate rows should be returned
assert len(fail_dfs["D001"]) == 2
assert "D002" in fail_dfs
    # D002 cannot be run with duplicates
assert fail_dfs["D002"] is None
def test_clean_data_with_duplicates(
custom_client, df_with_duplicates_set2, common_kwargs
):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_duplicates_set2, **common_kwargs
)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_with_duplicates_set2,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
agg_dict={"y": "sum"},
**common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
assert len(cleaned_df) == 3
def test_clean_data_raises_valueerror(
custom_client, df_with_duplicates_set2, common_kwargs
):
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_duplicates_set2, **common_kwargs
)
with pytest.raises(
ValueError, match="agg_dict must be provided to resolve D001 failure."
):
custom_client.clean_data(
df=df_with_duplicates_set2,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
**common_kwargs
)
def test_audit_data_with_missing_dates(
custom_client, df_with_missing_dates, common_kwargs
):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_missing_dates, **common_kwargs
)
assert not all_pass
assert len(case_specific_dfs) == 0
assert len(fail_dfs) == 1
assert "D002" in fail_dfs
assert len(fail_dfs["D002"]) == 2 # Two missing dates should be returned
def test_clean_data_with_missing_dates(
custom_client, df_with_missing_dates, common_kwargs
):
# First audit to get fail_dfs and case_specific_dfs
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_missing_dates, **common_kwargs
)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_with_missing_dates,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
agg_dict={"y": "sum"},
**common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
assert len(cleaned_df) == 6 # Two missing rows added.
assert pd.to_datetime("2023-01-02") in pd.to_datetime(cleaned_df["ds"]).values
def test_audit_data_with_duplicates_and_missing_dates(
custom_client, df_with_duplicates_and_missing_dates, common_kwargs
):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_duplicates_and_missing_dates, **common_kwargs
)
assert not all_pass
assert len(case_specific_dfs) == 0
assert len(fail_dfs) == 2
assert "D001" in fail_dfs
assert len(fail_dfs["D001"]) == 2 # The two duplicate rows should be returned
assert "D002" in fail_dfs
assert fail_dfs["D002"] is None # D002 can not be run with duplicates
def test_clean_data_with_duplicates_and_missing_dates(
custom_client, df_with_duplicates_and_missing_dates, common_kwargs
):
# First audit to get fail_dfs and case_specific_dfs
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_duplicates_and_missing_dates, **common_kwargs
)
# Clean Data (pass 1 will clear the duplicates)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_with_duplicates_and_missing_dates,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
agg_dict={"y": "sum"},
**common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 1
# Since duplicates have been removed, D002 has been run now.
assert "D002" in fail_dfs
assert len(fail_dfs["D002"]) == 1
assert len(case_specific_dfs) == 0
    assert len(cleaned_df) == 4  # Two duplicate rows consolidated into one.
# Clean Data (pass 2 will clear the missing dates)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=cleaned_df,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
**common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
    # Two duplicate rows consolidated into one, plus one missing row added.
assert len(cleaned_df) == 5
def test_audit_data_with_cat_columns(custom_client, df_with_cat_columns, common_kwargs):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_with_cat_columns, **common_kwargs
)
assert not all_pass
assert len(case_specific_dfs) == 0
assert len(fail_dfs) == 1
assert "F001" in fail_dfs
assert fail_dfs["F001"].shape[1] == 2 # Should return both categorical columns
def test_audit_data_with_negative_vals(custom_client, df_negative_vals, common_kwargs):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_negative_vals, **common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 1
assert "V001" in case_specific_dfs
assert case_specific_dfs["V001"].shape[0] == 3 # should return all negative values
def test_clean_data_with_negative_vals_without_cleaning_case_specific(
custom_client, df_negative_vals, common_kwargs
):
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_negative_vals, **common_kwargs
)
_, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_negative_vals,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
# clean_case_specific=False, # Default
**common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 1
assert "V001" in case_specific_dfs
assert case_specific_dfs["V001"].shape[0] == 3 # should return all negative values
def test_clean_data_with_negative_vals_cleaning_case_specific(
custom_client, df_negative_vals, common_kwargs
):
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_negative_vals, **common_kwargs
)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_negative_vals,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
clean_case_specific=True,
**common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 1
assert "V002" in case_specific_dfs
assert case_specific_dfs["V002"].shape[0] == 1 # should return leading zeros
    # Clean data, second pass: removes the leading zeros
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=cleaned_df,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
clean_case_specific=True,
**common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
def test_audit_data_leading_zeros(custom_client, common_kwargs, df_leading_zeros_set2):
all_pass, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_leading_zeros_set2, **common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 1
assert "V002" in case_specific_dfs
assert (
case_specific_dfs["V002"].shape[0] == 2
) # should return ids with leading zeros
def test_clean_data_leading_zeroes_without_cleaning_case_specific(
custom_client, common_kwargs, df_leading_zeros_set2
):
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_leading_zeros_set2, **common_kwargs
)
_, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_leading_zeros_set2,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
# clean_case_specific=False, # Default
**common_kwargs
)
assert not all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 1
assert "V002" in case_specific_dfs
assert (
case_specific_dfs["V002"].shape[0] == 2
) # should return ids with leading zeros
def test_clean_data_with_cleaning_case_specific(
custom_client, common_kwargs, df_leading_zeros_set2
):
_, fail_dfs, case_specific_dfs = custom_client.audit_data(
df=df_leading_zeros_set2, **common_kwargs
)
cleaned_df, all_pass, fail_dfs, case_specific_dfs = custom_client.clean_data(
df=df_leading_zeros_set2,
fail_dict=fail_dfs,
case_specific_dict=case_specific_dfs,
clean_case_specific=True,
**common_kwargs
)
assert all_pass
assert len(fail_dfs) == 0
assert len(case_specific_dfs) == 0
assert len(cleaned_df) == 7 # all leading zeros removed, zero series unchanged
import os
import pytest
import pandas as pd
import warnings
from nixtla_tests.helpers.client_helper import delete_env_var
from nixtla.nixtla_client import NixtlaClient
def test_custom_business_hours(
business_hours_series, custom_business_hours
):
nixtla_test_client = NixtlaClient()
nixtla_test_client.detect_anomalies(
df=business_hours_series, freq=custom_business_hours, level=90
)
nixtla_test_client.cross_validation(
df=business_hours_series, freq=custom_business_hours, h=7
)
fcst = nixtla_test_client.forecast(
df=business_hours_series, freq=custom_business_hours, h=7
)
assert sorted(fcst["ds"].dt.hour.unique().tolist()) == list(range(9, 16))
assert [
(model, freq.lower())
for (model, freq) in nixtla_test_client._model_params.keys()
] == [("timegpt-1", "cbh")]
def test_integer_freq(integer_freq_series):
nixtla_test_client = NixtlaClient()
nixtla_test_client.detect_anomalies(df=integer_freq_series, level=90, freq=1)
nixtla_test_client.cross_validation(df=integer_freq_series, h=7, freq=1)
fcst = nixtla_test_client.forecast(df=integer_freq_series, h=7, freq=1)
train_ends = integer_freq_series.groupby("unique_id", observed=True)["ds"].max()
fcst_ends = fcst.groupby("unique_id", observed=True)["ds"].max()
pd.testing.assert_series_equal(fcst_ends, train_ends + 7)
assert list(nixtla_test_client._model_params.keys()) == [("timegpt-1", "MS")]
def test_api_key_fail():
with delete_env_var("NIXTLA_API_KEY"), delete_env_var("TIMEGPT_TOKEN"):
with pytest.raises(KeyError) as excinfo:
NixtlaClient()
assert "NIXTLA_API_KEY" in str(excinfo.value)
def test_api_key_success():
nixtla_client = NixtlaClient()
assert nixtla_client.validate_api_key()
def test_custom_client_success():
custom_client = NixtlaClient(
base_url=os.environ["NIXTLA_BASE_URL_CUSTOM"],
api_key=os.environ["NIXTLA_API_KEY_CUSTOM"],
)
assert custom_client.validate_api_key()
# assert the usage endpoint
usage = custom_client.usage()
assert sorted(usage.keys()) == ["minute", "month"]
def test_forecast_with_wrong_api_key():
with pytest.raises(Exception) as excinfo:
        NixtlaClient(api_key="invalid_api_key").forecast(
df=pd.DataFrame(), h=None, validate_api_key=True
)
assert "nixtla" in str(excinfo.value)
def test_get_model_params(nixtla_test_client):
assert nixtla_test_client._get_model_params(model="timegpt-1", freq="D") == (28, 7)
def test_client_plot(nixtla_test_client, air_passengers_df):
nixtla_test_client.plot(
air_passengers_df, time_col="timestamp", target_col="value", engine="plotly"
)
def test_finetune_cv(nixtla_test_client, air_passengers_df):
finetune_cv = nixtla_test_client.cross_validation(
air_passengers_df,
h=12,
time_col="timestamp",
target_col="value",
n_windows=1,
finetune_steps=1,
)
assert finetune_cv is not None
def test_forecast_warning(nixtla_test_client, air_passengers_df, caplog):
nixtla_test_client.forecast(
df=air_passengers_df.tail(3),
h=100,
time_col="timestamp",
target_col="value",
)
assert 'The specified horizon "h" exceeds the model horizon' in caplog.text
@pytest.mark.parametrize(
"kwargs",
[
{"add_history": True},
],
ids=["short horizon with add_history"],
)
def test_forecast_error(nixtla_test_client, air_passengers_df, kwargs):
with pytest.raises(
ValueError, match="Some series are too short. Please make sure that each series"
):
nixtla_test_client.forecast(
df=air_passengers_df.tail(3),
h=12,
time_col="timestamp",
target_col="value",
**kwargs,
)
def test_large_request_partition_error(nixtla_test_client, large_series):
with pytest.raises(Exception) as excinfo:
nixtla_test_client.forecast(df=large_series, h=1, freq="min", finetune_steps=2)
assert "num_partitions" in str(excinfo.value)
def test_forecast_exogenous_warnings(
nixtla_test_client, two_short_series_with_time_features_train_future
):
train, future = two_short_series_with_time_features_train_future
# features in df but not in X_df
missing_exogenous = train.columns.drop(["unique_id", "ds", "y"]).tolist()
expected_warning = (
f"`df` contains the following exogenous features: {missing_exogenous}, "
"but `X_df` was not provided and they were not declared in `hist_exog_list`. "
"They will be ignored."
)
with warnings.catch_warnings(record=True) as w:
nixtla_test_client.forecast(train, h=5)
assert any(expected_warning in str(warning.message) for warning in w)
    # features in df that are neither declared as historic nor present in X_df
expected_warning = (
"`df` contains the following exogenous features: ['month'], "
"but they were not found in `X_df` nor declared in `hist_exog_list`. "
"They will be ignored."
)
with warnings.catch_warnings(record=True) as w:
nixtla_test_client.forecast(
train, h=5, X_df=future[["unique_id", "ds", "year"]]
)
assert any(expected_warning in str(warning.message) for warning in w)
def test_features_not_in_df_error(
nixtla_test_client, two_short_series_with_time_features_train_future
):
train, future = two_short_series_with_time_features_train_future
with pytest.raises(
ValueError, match="features are present in `X_df` but not in `df`"
):
nixtla_test_client.forecast(
df=train[["unique_id", "ds", "y"]],
h=5,
X_df=future,
)
def test_setting_one_as_historic_and_other_as_future(
nixtla_test_client, two_short_series_with_time_features_train_future
):
train, future = two_short_series_with_time_features_train_future
# test setting one as historic and other as future
nixtla_test_client.forecast(
train, h=5, X_df=future[["unique_id", "ds", "year"]], hist_exog_list=["month"]
)
assert nixtla_test_client.weights_x["features"].tolist() == ["year", "month"]
import pytest
from nixtla_tests.helpers.checks import check_anomalies_dataframe
from nixtla_tests.helpers.checks import check_anomalies_online_dataframe
from nixtla_tests.helpers.checks import check_anomalies_dataframe_diff_cols
from nixtla_tests.helpers.checks import check_forecast_dataframe
from nixtla_tests.helpers.checks import check_forecast_dataframe_diff_cols
from nixtla_tests.helpers.checks import check_forecast_x_dataframe
from nixtla_tests.helpers.checks import check_forecast_x_dataframe_diff_cols
from nixtla_tests.helpers.checks import check_quantiles
pytestmark = pytest.mark.distributed_run
def test_quantiles(nixtla_test_client, dask_df):
check_quantiles(nixtla_test_client, dask_df, id_col="unique_id", time_col="ds")
def test_forecast(nixtla_test_client, dask_df, dask_diff_cols_df, distributed_n_series):
check_forecast_dataframe(
nixtla_test_client, dask_df, n_series_to_check=distributed_n_series
)
check_forecast_dataframe_diff_cols(nixtla_test_client, dask_diff_cols_df)
def test_anomalies(nixtla_test_client, dask_df, dask_diff_cols_df):
check_anomalies_dataframe(nixtla_test_client, dask_df)
check_anomalies_dataframe_diff_cols(nixtla_test_client, dask_diff_cols_df)
def test_anomalies_online(nixtla_test_client, dask_df):
check_anomalies_online_dataframe(nixtla_test_client, dask_df)
def test_forecast_x_dataframe(
nixtla_test_client,
dask_df_x,
dask_future_ex_vars_df,
dask_df_x_diff_cols,
dask_future_ex_vars_df_diff_cols,
):
check_forecast_x_dataframe(nixtla_test_client, dask_df_x, dask_future_ex_vars_df)
check_forecast_x_dataframe_diff_cols(
nixtla_test_client,
dask_df_x_diff_cols,
dask_future_ex_vars_df_diff_cols,
)
def test_detect_anomalies_online_univariate(nixtla_test_client, anomaly_online_df):
df, n_series, detection_size = anomaly_online_df
anomaly_df = nixtla_test_client.detect_anomalies_online(
df,
h=20,
detection_size=detection_size,
threshold_method="univariate",
freq="W-SUN",
level=99,
)
assert len(anomaly_df) == n_series * detection_size
assert (
len(anomaly_df.columns) == 8
) # [unique_id, ds, TimeGPT, y, anomaly, anomaly_score, hi, lo]
assert anomaly_df["anomaly"].sum() == 2
assert anomaly_df["anomaly"].iloc[0] and anomaly_df["anomaly"].iloc[-1]
def test_detect_anomalies_online_multivariate(nixtla_test_client, anomaly_online_df):
df, n_series, detection_size = anomaly_online_df
multi_anomaly_df = nixtla_test_client.detect_anomalies_online(
df,
h=20,
detection_size=detection_size,
threshold_method="multivariate",
freq="W-SUN",
level=99,
)
assert len(multi_anomaly_df) == n_series * detection_size
assert (
len(multi_anomaly_df.columns) == 7
) # [unique_id, ds, TimeGPT, y, anomaly, anomaly_score, accumulated_anomaly_score]
assert multi_anomaly_df["anomaly"].sum() == 4
assert (
multi_anomaly_df["anomaly"].iloc[0]
and multi_anomaly_df["anomaly"].iloc[4]
and multi_anomaly_df["anomaly"].iloc[5]
and multi_anomaly_df["anomaly"].iloc[9]
)
import pytest
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import rmse
from nixtla.nixtla_client import ApiError
from nixtla_tests.helpers.checks import check_finetuned_model
from nixtla_tests.helpers.states import model_ids_object
class TestTimeSeriesDataSet1:
def test_finetuning_and_forecasting(self, custom_client, ts_data_set1):
# Finetune the model
finetune_resp = custom_client.finetune(
ts_data_set1.train, output_model_id=model_ids_object.model_id1
)
assert finetune_resp == model_ids_object.model_id1
model_id2 = custom_client.finetune(
ts_data_set1.train, finetuned_model_id=model_ids_object.model_id1
)
model_ids_object.model_id2 = model_id2 # store the model_id2 for later use
# Forecast with fine-tuned models
forecast_base = custom_client.forecast(ts_data_set1.train, h=ts_data_set1.h)
forecast1 = custom_client.forecast(
ts_data_set1.train,
h=ts_data_set1.h,
finetuned_model_id=model_ids_object.model_id1,
)
forecast2 = custom_client.forecast(
ts_data_set1.train,
h=ts_data_set1.h,
finetuned_model_id=model_ids_object.model_id2,
)
all_fcsts = forecast_base.assign(
ten_rounds=forecast1["TimeGPT"], twenty_rounds=forecast2["TimeGPT"]
)
fcst_rmse = evaluate(
all_fcsts.merge(ts_data_set1.valid),
metrics=[rmse],
agg_fn="mean",
).loc[0]
        # error was reduced by over 30% by finetuning
assert 1 - fcst_rmse["ten_rounds"] / fcst_rmse["TimeGPT"] > 0.3
        # error was reduced by over 20% by further finetuning
assert 1 - fcst_rmse["twenty_rounds"] / fcst_rmse["ten_rounds"] > 0.2
# non-existent model returns 404
with pytest.raises(ApiError) as excinfo:
custom_client.forecast(
ts_data_set1.train, h=ts_data_set1.h, finetuned_model_id="unexisting"
)
assert getattr(excinfo.value, "status_code", None) == 404
# Enough data to finetune
_ = custom_client.forecast(
ts_data_set1.train.tail(2),
h=ts_data_set1.h,
finetune_steps=10,
freq="D",
)
def test_cv_with_finetuned_model(self, custom_client, ts_data_set1):
try:
cv_base = custom_client.cross_validation(
ts_data_set1.series, n_windows=2, h=ts_data_set1.h
)
cv_finetune = custom_client.cross_validation(
ts_data_set1.series,
n_windows=2,
h=ts_data_set1.h,
finetuned_model_id=model_ids_object.model_id1,
)
merged = cv_base.merge(
cv_finetune,
on=["unique_id", "ds", "cutoff", "y"],
suffixes=("_base", "_finetune"),
).drop(columns="cutoff")
cv_rmse = evaluate(
merged,
metrics=[rmse],
agg_fn="mean",
).loc[0]
            # error was reduced by over 30% by finetuning
assert 1 - cv_rmse["TimeGPT_finetune"] / cv_rmse["TimeGPT_base"] > 0.3
finally:
custom_client.delete_finetuned_model(model_ids_object.model_id1)
def test_anomaly_detection_with_finetuned_model(
self, custom_client, ts_anomaly_data
):
anomaly_base = custom_client.detect_anomalies(ts_anomaly_data.train_anomalies)
anomaly_finetune = custom_client.detect_anomalies(
ts_anomaly_data.train_anomalies,
finetuned_model_id=model_ids_object.model_id2,
)
detected_anomalies_base = (
anomaly_base.set_index("ds")
.loc[ts_anomaly_data.anomaly_date, "anomaly"]
.sum()
)
detected_anomalies_finetune = (
anomaly_finetune.set_index("ds")
.loc[ts_anomaly_data.anomaly_date, "anomaly"]
.sum()
)
assert detected_anomalies_base < detected_anomalies_finetune
def test_list_finetuned_models(self, custom_client):
models = custom_client.finetuned_models()
ids = {m.id for m in models}
assert (
model_ids_object.model_id1 not in ids and model_ids_object.model_id2 in ids
)
def test_get_single_finetuned_model(self, custom_client):
single_model = custom_client.finetuned_model(model_ids_object.model_id2)
assert single_model.id == model_ids_object.model_id2
assert single_model.base_model_id == model_ids_object.model_id1
def test_non_existing_model_returns_error(self, custom_client):
with pytest.raises(ApiError, match="Model not found"):
custom_client.finetuned_model("hi")
@pytest.mark.distributed_run
@pytest.mark.ray_run
def test_ray_finetune_model(self, custom_client, ray_df):
check_finetuned_model(custom_client, ray_df, model_ids_object.model_id2)
@pytest.mark.distributed_run
@pytest.mark.spark_run
def test_spark_finetune_model(self, custom_client, spark_df):
check_finetuned_model(custom_client, spark_df, model_ids_object.model_id2)
@pytest.mark.distributed_run
@pytest.mark.flaky(reruns=3, delay=10)
def test_dask_finetune_model(self, custom_client, dask_df):
check_finetuned_model(custom_client, dask_df, model_ids_object.model_id2)
from contextlib import contextmanager
from copy import deepcopy
import httpx
import numpy as np
import pandas as pd
import pytest
import zstandard as zstd
from pydantic import ValidationError
from nixtla_tests.conftest import HYPER_PARAMS_TEST
from nixtla_tests.helpers.checks import (
check_equal_fcsts_add_history,
check_num_partitions_same_results,
)
CAPTURED_REQUEST = None
class CapturingClient(httpx.Client):
def post(self, *args, **kwargs):
request = self.build_request("POST", *args, **kwargs)
global CAPTURED_REQUEST
CAPTURED_REQUEST = {
"headers": dict(request.headers),
"content": request.content,
"method": request.method,
"url": str(request.url),
}
return super().post(*args, **kwargs)
@contextmanager
def capture_request():
original_client = httpx.Client
httpx.Client = CapturingClient
try:
yield
finally:
httpx.Client = original_client
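# A minimal usage sketch (not collected by pytest): while the context manager is
# active, httpx.Client is swapped for CapturingClient, so the headers and body of
# the next POST are stored in CAPTURED_REQUEST. test_compression below relies on
# this; `client` and `df` here stand in for a configured NixtlaClient and a pandas
# frame with unique_id/ds/y columns.
def _example_capture_request(client, df):
    with capture_request():
        client.forecast(df=df, h=1, freq="D")
    return CAPTURED_REQUEST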
@pytest.mark.parametrize(
"df_converter, freq",
[
pytest.param(lambda series, with_gaps: with_gaps, "5min", id="gaps"),
pytest.param(
lambda series, with_gaps: pd.concat([series, series]),
"5min",
id="duplicates",
),
pytest.param(lambda series, with_gaps: series, "1min", id="wrong_freq"),
],
)
def test_forecast_with_error(series_with_gaps, nixtla_test_client, df_converter, freq):
series, with_gaps = series_with_gaps
with pytest.raises(
ValueError,
match="missing or duplicate timestamps, or the timestamps do not match",
):
nixtla_test_client.forecast(df=df_converter(series, with_gaps), h=1, freq=freq)
@pytest.mark.parametrize("test_params, expected_exception, expected_error_msg",
[
({"model_parameters": None}, None, ""),
({"model_parameters": {"max_q": 1}}, None, ""),
({"model_parameters": {"max_p": None}}, None, ""),
({"model_parameters": {"horizon": [1, 2, 3]}}, None, ""),
({"model_parameters": {"horizon": (1, 2, 3)}}, None, ""),
({"model_parameters": {"horizon": {"nested": "dict"}}}, None, ""),
({"model_parameters": {"horizon": {"nested": None}}}, None, ""),
({"model_parameters": "not a dict"}, ValidationError, "Input should be a valid dictionary"),
({"model_parameters": 123}, ValidationError, "Input should be a valid dictionary"),
({"model_parameters": {"horizon": {"nested_key": [1, 2, 3]}}}, TypeError, "Invalid value type"),
({"model_parameters": {"horizon": {"nested_key": (1, 2, 3)}}}, TypeError, "Invalid value type"),
({"model_parameters": {"horizon": {"nested_key": {1, 2}}}}, TypeError, "Invalid value type"),
({"model_parameters": {"horizon": {"nested_key": {"inner_key": "val"}}}}, TypeError, "Invalid value type"),
({"model_parameters": {"horizon": pd.DataFrame()}}, TypeError, "Invalid value type"),
]
)
@pytest.mark.parametrize("endpoint", ["forecast", "cross_validation"])
def test_model_parameters(nixtla_test_client, air_passengers_df, test_params, expected_exception, expected_error_msg, endpoint):
base_params = {
"df": air_passengers_df,
"h": 12,
"time_col": "timestamp",
"target_col": "value",
}
base_params.update(test_params)
if expected_exception is None:
if endpoint == "forecast":
nixtla_test_client.forecast(**base_params)
elif endpoint == "cross_validation":
nixtla_test_client.cross_validation(**base_params)
else:
with pytest.raises(expected_exception) as exc_info:
if endpoint == "forecast":
nixtla_test_client.forecast(**base_params)
elif endpoint == "cross_validation":
nixtla_test_client.cross_validation(**base_params)
assert expected_error_msg in str(exc_info.value)
def test_cv_forecast_consistency(nixtla_test_client, cv_series_with_features):
series_with_features, train, valid, x_cols, h, freq = cv_series_with_features
for hist_exog_list in [None, [], [x_cols[2], x_cols[1]], x_cols]:
cv_res = nixtla_test_client.cross_validation(
series_with_features,
n_windows=1,
h=h,
freq=freq,
hist_exog_list=hist_exog_list,
)
fcst_res = nixtla_test_client.forecast(
train,
h=h,
freq=freq,
hist_exog_list=hist_exog_list,
X_df=valid,
)
np.testing.assert_allclose(
cv_res["TimeGPT"], fcst_res["TimeGPT"], atol=1e-4, rtol=1e-3
)
def test_forecast_different_hist_exog_gives_different_results(
nixtla_test_client, cv_series_with_features
):
_, train, valid, x_cols, h, freq = cv_series_with_features
for X_df in (None, valid):
res1 = nixtla_test_client.forecast(
train, h=h, X_df=X_df, freq=freq, hist_exog_list=x_cols[:2]
)
res2 = nixtla_test_client.forecast(
train, h=h, X_df=X_df, freq=freq, hist_exog_list=x_cols[2:]
)
with pytest.raises(AssertionError):
np.testing.assert_allclose(
res1["TimeGPT"],
res2["TimeGPT"],
atol=1e-4,
rtol=1e-3,
)
def test_forecast_date_features_multiple_series_and_different_ends(
nixtla_test_client, two_short_series
):
h = 12
fcst_test_series = nixtla_test_client.forecast(
two_short_series, h=h, date_features=["dayofweek"]
)
uids = two_short_series["unique_id"]
for uid in uids:
expected = pd.date_range(
periods=h + 1, start=two_short_series.query("unique_id == @uid")["ds"].max()
)[1:].tolist()
actual = fcst_test_series.query("unique_id == @uid")["ds"].tolist()
assert actual == expected
def test_compression(nixtla_test_client, series_1MB_payload):
with capture_request():
nixtla_test_client.forecast(
df=series_1MB_payload,
freq="D",
h=1,
hist_exog_list=["static_0", "static_1"],
)
assert CAPTURED_REQUEST["headers"]["content-encoding"] == "zstd"
content = CAPTURED_REQUEST["content"]
assert len(content) < 2**20
assert len(zstd.ZstdDecompressor().decompress(content)) > 2**20
def test_cv_refit_equivalence(nixtla_test_client, air_passengers_df):
cv_kwargs = dict(
df=air_passengers_df,
n_windows=2,
h=12,
freq="MS",
time_col="timestamp",
target_col="value",
finetune_steps=2,
)
res_refit = nixtla_test_client.cross_validation(refit=True, **cv_kwargs)
res_no_refit = nixtla_test_client.cross_validation(refit=False, **cv_kwargs)
np.testing.assert_allclose(res_refit["value"], res_no_refit["value"])
with pytest.raises(AssertionError):
np.testing.assert_allclose(
res_refit["TimeGPT"],
res_no_refit["TimeGPT"],
atol=1e-4,
rtol=1e-3,
)
def test_forecast_quantiles_error(nixtla_test_client, air_passengers_df):
with pytest.raises(Exception) as excinfo:
nixtla_test_client.forecast(
df=air_passengers_df,
h=12,
time_col="timestamp",
target_col="value",
level=[80],
quantiles=[0.2, 0.3],
)
assert "not both" in str(excinfo.value)
@pytest.mark.parametrize(
"method,kwargs",
[
("forecast", {}),
("forecast", {"add_history": True}),
("cross_validation", {}),
],
)
def test_forecast_quantiles_output(
nixtla_test_client, air_passengers_df, method, kwargs
):
test_qls = list(np.arange(0.1, 1, 0.1))
exp_q_cols = [f"TimeGPT-q-{int(100 * q)}" for q in test_qls]
args = {
"df": air_passengers_df,
"h": 12,
"time_col": "timestamp",
"target_col": "value",
"quantiles": test_qls,
**kwargs,
}
if method == "cross_validation":
func = nixtla_test_client.cross_validation
elif method == "forecast":
func = nixtla_test_client.forecast
df_qls = func(**args)
assert all(col in df_qls.columns for col in exp_q_cols)
assert not any("-lo-" in col for col in df_qls.columns)
# test monotonicity of quantiles
for c1, c2 in zip(exp_q_cols[:-1], exp_q_cols[1:]):
assert df_qls[c1].lt(df_qls[c2]).all()
@pytest.mark.parametrize("freq", ["D", "W-THU", "Q-DEC", "15T"])
@pytest.mark.parametrize(
"method_name,method_kwargs,exog",
[
("detect_anomalies", {"level": 98}, False),
("cross_validation", {"h": 7, "n_windows": 2}, False),
("forecast", {"h": 7, "add_history": True}, False),
("detect_anomalies", {"level": 98}, True),
("cross_validation", {"h": 7, "n_windows": 2}, False),
("forecast", {"h": 7, "add_history": True}, False),
],
)
def test_num_partitions_same_results_parametrized(
nixtla_test_client, df_freq_generator, method_name, method_kwargs, freq, exog
):
    method_mapper = {
"detect_anomalies": nixtla_test_client.detect_anomalies,
"cross_validation": nixtla_test_client.cross_validation,
"forecast": nixtla_test_client.forecast,
}
    method = method_mapper[method_name]
df_freq = df_freq_generator(n_series=10, min_length=500, max_length=550, freq=freq)
df_freq["ds"] = df_freq.groupby("unique_id", observed=True)["ds"].transform(
lambda x: pd.date_range(periods=len(x), freq=freq, end="2023-01-01")
)
if exog:
df_freq["exog_1"] = 1
kwargs = {
"method": method,
"num_partitions": 2,
"df": df_freq,
**method_kwargs,
}
check_num_partitions_same_results(**kwargs)
@pytest.mark.parametrize(
"freq,h",
[
("D", 7),
("W-THU", 52),
("Q-DEC", 8),
("15T", 4 * 24 * 7),
],
)
def test_forecast_models_different_results(
nixtla_test_client, df_freq_generator, freq, h
):
df_freq = df_freq_generator(n_series=10, min_length=500, max_length=550, freq=freq)
df_freq["ds"] = df_freq.groupby("unique_id", observed=True)["ds"].transform(
lambda x: pd.date_range(periods=len(x), freq=freq, end="2023-01-01")
)
kwargs = dict(df=df_freq, h=h)
fcst_1_df = check_equal_fcsts_add_history(
nixtla_test_client, **{**kwargs, "model": "timegpt-1"}
)
fcst_2_df = check_equal_fcsts_add_history(
nixtla_test_client, **{**kwargs, "model": "timegpt-1-long-horizon"}
)
with pytest.raises(
AssertionError, match=r'\(column name="TimeGPT"\) are different'
):
pd.testing.assert_frame_equal(fcst_1_df, fcst_2_df)
@pytest.mark.parametrize(
"method, method_kwargs",
[
(
"forecast",
dict(
h=12,
level=[90, 95],
add_history=True,
time_col="timestamp",
target_col="value",
),
),
(
"cross_validation",
dict(h=12, level=[90, 95], time_col="timestamp", target_col="value"),
),
("detect_anomalies", dict(level=99, time_col="timestamp", target_col="value")),
],
)
def test_different_models_give_different_results(
air_passengers_df, nixtla_test_client, method, method_kwargs
):
    method_mapper = {
"detect_anomalies": nixtla_test_client.detect_anomalies,
"cross_validation": nixtla_test_client.cross_validation,
"forecast": nixtla_test_client.forecast,
}
    execute = method_mapper[method]
# Run with first model
out1 = execute(df=air_passengers_df, model="timegpt-1", **method_kwargs)
# Run with second model
out2 = execute(
df=air_passengers_df, model="timegpt-1-long-horizon", **method_kwargs
)
# Compare only the TimeGPT column
with pytest.raises(
AssertionError, match=r'\(column name="TimeGPT"\) are different'
):
pd.testing.assert_frame_equal(out1[["TimeGPT"]], out2[["TimeGPT"]])
# test unsupported model
method_kwargs["model"] = "my-awesome-model"
with pytest.raises(ValueError, match="unsupported model"):
execute(df=air_passengers_df, **method_kwargs)
def test_shap_features(nixtla_test_client, date_features_result):
# Test shap values are returned and sum to predictions
df_date_features, future_df, _ = date_features_result
h = 12
fcst_df = nixtla_test_client.forecast(
df=df_date_features, h=h, X_df=future_df, feature_contributions=True
)
shap_values = nixtla_test_client.feature_contributions
assert len(shap_values) == len(fcst_df)
np.testing.assert_allclose(
fcst_df["TimeGPT"].values, shap_values.iloc[:, 3:].sum(axis=1).values, rtol=1e-3
)
fcst_hist_df = nixtla_test_client.forecast(
df=df_date_features,
h=h,
X_df=future_df,
add_history=True,
feature_contributions=True,
)
shap_values_hist = nixtla_test_client.feature_contributions
assert len(shap_values_hist) == len(fcst_hist_df)
np.testing.assert_allclose(
fcst_hist_df["TimeGPT"].values,
shap_values_hist.iloc[:, 3:].sum(axis=1).values,
atol=1e-4,
)
# test num partitions
_ = nixtla_test_client.feature_contributions
pd.testing.assert_frame_equal(
nixtla_test_client.feature_contributions, shap_values_hist, atol=1e-4, rtol=1e-3
)
@pytest.mark.parametrize("hyp", HYPER_PARAMS_TEST)
def test_exogenous_variables_cv(nixtla_test_client, exog_data, hyp):
df_ex_, df_train, df_test, x_df_test = exog_data
fcst_test = nixtla_test_client.forecast(
df_train.merge(df_ex_.drop(columns="y")), h=12, X_df=x_df_test, **hyp
)
fcst_test = df_test[["unique_id", "ds", "y"]].merge(fcst_test)
fcst_test = fcst_test.sort_values(["unique_id", "ds"]).reset_index(drop=True)
fcst_cv = nixtla_test_client.cross_validation(df_ex_, h=12, **hyp)
fcst_cv = fcst_cv.sort_values(["unique_id", "ds"]).reset_index(drop=True)
pd.testing.assert_frame_equal(
fcst_test,
fcst_cv.drop(columns="cutoff"),
atol=1e-4,
rtol=1e-3,
)
@pytest.mark.parametrize("hyp", HYPER_PARAMS_TEST)
def test_forecast_vs_cv_no_exog(
nixtla_test_client, train_test_split, air_passengers_renamed_df, hyp
):
df_train, df_test = train_test_split
fcst_test = nixtla_test_client.forecast(df_train, h=12, **hyp)
fcst_test = df_test[["unique_id", "ds", "y"]].merge(fcst_test)
fcst_test = fcst_test.sort_values(["unique_id", "ds"]).reset_index(drop=True)
fcst_cv = nixtla_test_client.cross_validation(
air_passengers_renamed_df, h=12, **hyp
)
fcst_cv = fcst_cv.sort_values(["unique_id", "ds"]).reset_index(drop=True)
pd.testing.assert_frame_equal(
fcst_test,
fcst_cv.drop(columns="cutoff"),
rtol=1e-2,
)
@pytest.mark.parametrize("hyp", HYPER_PARAMS_TEST)
def test_forecast_vs_cv_insert_y(
nixtla_test_client, train_test_split, air_passengers_renamed_df, hyp
):
df_train, df_test = train_test_split
fcst_test = nixtla_test_client.forecast(df_train, h=12, **hyp)
fcst_test.insert(2, "y", df_test["y"].values)
fcst_test = fcst_test.sort_values(["unique_id", "ds"]).reset_index(drop=True)
fcst_cv = nixtla_test_client.cross_validation(
air_passengers_renamed_df, h=12, **hyp
)
fcst_cv = fcst_cv.sort_values(["unique_id", "ds"]).reset_index(drop=True)
pd.testing.assert_frame_equal(
fcst_test,
fcst_cv.drop(columns="cutoff"),
rtol=1e-2,
)
def test_forecast_and_anomalies_index_vs_columns(
nixtla_test_client, air_passengers_renamed_df, air_passengers_renamed_df_with_index
):
fcst_inferred_df_index = nixtla_test_client.forecast(
air_passengers_renamed_df_with_index, h=10
)
anom_inferred_df_index = nixtla_test_client.detect_anomalies(
air_passengers_renamed_df_with_index
)
fcst_inferred_df = nixtla_test_client.forecast(
air_passengers_renamed_df[["ds", "unique_id", "y"]], h=10
)
anom_inferred_df = nixtla_test_client.detect_anomalies(
air_passengers_renamed_df[["ds", "unique_id", "y"]]
)
pd.testing.assert_frame_equal(
fcst_inferred_df_index, fcst_inferred_df, atol=1e-4, rtol=1e-3
)
pd.testing.assert_frame_equal(
anom_inferred_df_index, anom_inferred_df, atol=1e-4, rtol=1e-3
)
@pytest.mark.parametrize("freq", ["Y", "W-MON", "Q-DEC", "H"])
def test_forecast_index_vs_columns_various_freq(
nixtla_test_client, air_passengers_renamed_df_with_index, freq
):
df_ds_index = air_passengers_renamed_df_with_index.groupby("unique_id").tail(80)
df_ds_index.index = np.concatenate(
df_ds_index["unique_id"].nunique()
* [pd.date_range(end="2023-01-01", periods=80, freq=freq)]
)
df_ds_index.index.name = "ds"
fcst_inferred_df_index = nixtla_test_client.forecast(df_ds_index, h=10)
df_test = df_ds_index.reset_index()
fcst_inferred_df = nixtla_test_client.forecast(df_test, h=10)
pd.testing.assert_frame_equal(
fcst_inferred_df_index, fcst_inferred_df, atol=1e-4, rtol=1e-3
)
def test_index_as_time_col(nixtla_test_client, air_passengers_df):
df_test = deepcopy(air_passengers_df)
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test.set_index(df_test["timestamp"], inplace=True)
df_test.drop(columns="timestamp", inplace=True)
    # Using user-provided time_col and freq
timegpt_anomalies_df_1 = nixtla_test_client.detect_anomalies(
air_passengers_df, time_col="timestamp", target_col="value", freq="M"
)
# Infer time_col and freq from index
timegpt_anomalies_df_2 = nixtla_test_client.detect_anomalies(
df_test, time_col="timestamp", target_col="value"
)
pd.testing.assert_frame_equal(
timegpt_anomalies_df_1,
timegpt_anomalies_df_2,
atol=1e-4,
rtol=1e-3,
)
import platform
import sys
import pytest
from nixtla_tests.helpers.checks import (
check_anomalies_dataframe,
check_anomalies_dataframe_diff_cols,
check_anomalies_online_dataframe,
check_forecast_dataframe,
check_forecast_dataframe_diff_cols,
check_forecast_x_dataframe,
check_forecast_x_dataframe_diff_cols,
check_quantiles,
)
pytestmark = [
pytest.mark.distributed_run,
pytest.mark.ray_run,
]
def test_quantiles(nixtla_test_client, ray_df):
check_quantiles(nixtla_test_client, ray_df, id_col="unique_id", time_col="ds")
def test_forecast(nixtla_test_client, ray_df, ray_diff_cols_df, distributed_n_series):
check_forecast_dataframe(
nixtla_test_client, ray_df, n_series_to_check=distributed_n_series
)
check_forecast_dataframe_diff_cols(nixtla_test_client, ray_diff_cols_df)
def test_anomalies(nixtla_test_client, ray_df, ray_diff_cols_df):
check_anomalies_dataframe(nixtla_test_client, ray_df)
check_anomalies_dataframe_diff_cols(nixtla_test_client, ray_diff_cols_df)
def test_anomalies_online(nixtla_test_client, ray_df):
check_anomalies_online_dataframe(nixtla_test_client, ray_df)
@pytest.mark.xfail(
    reason=(
        "triad.collections.schema.SchemaError: Schema can't be empty "
        "error triggered, see https://github.com/Nixtla/nixtla/blob/b56a89bf6b80b137c57f3511eef3ed8857705a59/nixtla/nixtla_client.py#L1383"
    )
)
def test_forecast_x_dataframe(
nixtla_test_client,
ray_df_x,
ray_future_ex_vars_df,
ray_df_x_diff_cols,
ray_future_ex_vars_df_diff_cols,
):
check_forecast_x_dataframe(nixtla_test_client, ray_df_x, ray_future_ex_vars_df)
check_forecast_x_dataframe_diff_cols(
nixtla_test_client,
ray_df_x_diff_cols,
ray_future_ex_vars_df_diff_cols,
)
import httpx
import pytest
import time
from itertools import product
from nixtla.nixtla_client import (
ApiError,
)
from nixtla_tests.helpers.checks import check_retry_behavior
def raise_api_error_with_text(*args, **kwargs):
raise ApiError(
status_code=503,
body="""
<html><head>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<title>503 Server Error</title>
</head>
<body text=#000000 bgcolor=#ffffff>
<h1>Error: Server Error</h1>
<h2>The service you requested is not available at this time.<p>Service error -27.</h2>
<h2></h2>
</body></html>
""",
)
def raise_api_error_with_json(*args, **kwargs):
raise ApiError(
status_code=422,
body=dict(detail="Please use numbers"),
)
def raise_read_timeout_error(*args, **kwargs):
sleep_seconds = 5
print(f"raising ReadTimeout error after {sleep_seconds} seconds")
time.sleep(sleep_seconds)
raise httpx.ReadTimeout("Timed out")
def raise_http_error(*args, **kwargs):
print("raising HTTP error")
raise ApiError(status_code=503, body="HTTP error")
@pytest.mark.parametrize(
"side_effect,side_effect_exception,should_retry",
[
(raise_api_error_with_text, ApiError, True),
(raise_api_error_with_json, ApiError, False),
],
)
def test_retry_behavior(
air_passengers_df, side_effect, side_effect_exception, should_retry
):
check_retry_behavior(
df=air_passengers_df,
side_effect=side_effect,
side_effect_exception=side_effect_exception,
should_retry=should_retry,
)
combs = [
(2, 5, 30),
(10, 1, 5),
]
side_effect_settings = [
(raise_read_timeout_error, httpx.ReadTimeout),
(raise_http_error, ApiError),
]
@pytest.mark.parametrize(
"retry_settings,side_effect_settings", product(combs, side_effect_settings)
)
def test_retry_behavior_set2(air_passengers_df, retry_settings, side_effect_settings):
max_retries, retry_interval, max_wait_time = retry_settings
side_effect, side_effect_exception = side_effect_settings
check_retry_behavior(
df=air_passengers_df,
side_effect=side_effect,
side_effect_exception=side_effect_exception,
max_retries=max_retries,
retry_interval=retry_interval,
max_wait_time=max_wait_time,
)
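# --- Illustrative sketch (not part of the original suite) --------------------
# check_retry_behavior lives in nixtla_tests/helpers/checks.py and is not shown
# here; the sketch below only illustrates the kind of check it is assumed to
# perform: build a client with the given retry settings, replace the HTTP call
# with the failing side effect, and assert that the expected exception surfaces.
# The patched attribute name (_make_request), the dummy api_key, and the default
# unique_id/ds/y daily-frequency layout of `df` are hypothetical placeholders.
from unittest.mock import patch

from nixtla import NixtlaClient


def _example_retry_check(df, side_effect, side_effect_exception, max_retries=6):
    client = NixtlaClient(api_key="dummy", max_retries=max_retries)
    # replace whatever method performs the HTTP call with the failing side effect
    with patch.object(client, "_make_request", side_effect=side_effect):
        with pytest.raises(side_effect_exception):
            client.forecast(df=df, h=12, freq="D")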
import pytest
from nixtla_tests.helpers.checks import (
    check_anomalies_dataframe,
    check_anomalies_dataframe_diff_cols,
    check_anomalies_online_dataframe,
    check_forecast_dataframe,
    check_forecast_dataframe_diff_cols,
    check_forecast_x_dataframe,
    check_forecast_x_dataframe_diff_cols,
    check_quantiles,
)
pytestmark = [
pytest.mark.distributed_run,
pytest.mark.spark_run
]
def test_quantiles(nixtla_test_client, spark_df):
check_quantiles(nixtla_test_client, spark_df, id_col="unique_id", time_col="ds")
def test_forecast(
nixtla_test_client, spark_df, spark_diff_cols_df, distributed_n_series
):
check_forecast_dataframe(
nixtla_test_client, spark_df, n_series_to_check=distributed_n_series
)
check_forecast_dataframe_diff_cols(nixtla_test_client, spark_diff_cols_df)
def test_anomalies(nixtla_test_client, spark_df, spark_diff_cols_df):
check_anomalies_dataframe(nixtla_test_client, spark_df)
check_anomalies_dataframe_diff_cols(nixtla_test_client, spark_diff_cols_df)
def test_anomalies_online(nixtla_test_client, spark_df):
check_anomalies_online_dataframe(nixtla_test_client, spark_df)
def test_forecast_x_dataframe(
nixtla_test_client,
spark_df_x,
spark_future_ex_vars_df,
spark_df_x_diff_cols,
spark_future_ex_vars_df_diff_cols,
):
check_forecast_x_dataframe(nixtla_test_client, spark_df_x, spark_future_ex_vars_df)
check_forecast_x_dataframe_diff_cols(
nixtla_test_client, spark_df_x_diff_cols, spark_future_ex_vars_df_diff_cols
)
import re
import pandas as pd
import pytest
from nixtla.nixtla_client import (
    AuditDataSeverity,
    _audit_categorical_variables,
    _audit_duplicate_rows,
    _audit_leading_zeros,
    _audit_missing_dates,
    _audit_negative_values,
    _maybe_add_date_features,
    _model_in_list,
)
from nixtla.date_features import SpecialDates
@pytest.mark.parametrize(
"name, patterns, expected",
[
("a", ("a", "b"), True),
("a", ("b", "c"), False),
("axb", ("x", re.compile("a.*b")), True),
("axb", ("x", re.compile("^a.*b$")), True),
("a-b", ("x", re.compile("^a-.*b$")), True),
("a-dfdfb", ("x", re.compile("^a-.*b$")), True),
("abc", ("x", re.compile("ab"), re.compile("abcd")), False),
],
)
def test_model_in_list(name, patterns, expected):
assert _model_in_list(name, patterns) is expected
def test_audit_duplicate_rows_pass(df_no_duplicates):
audit, duplicates = _audit_duplicate_rows(df_no_duplicates)
assert audit == AuditDataSeverity.PASS
assert len(duplicates) == 0
def test_audit_duplicate_rows_fail(df_with_duplicates):
audit, duplicates = _audit_duplicate_rows(df_with_duplicates)
assert audit == AuditDataSeverity.FAIL
assert len(duplicates) == 2
def test_audit_missing_dates_complete(df_complete):
audit, missing = _audit_missing_dates(df_complete, freq="D")
assert audit == AuditDataSeverity.PASS
assert len(missing) == 0
def test_audit_missing_dates_with_missing(df_missing):
audit, missing = _audit_missing_dates(df_missing, freq="D")
assert audit == AuditDataSeverity.FAIL
assert len(missing) == 2 # One missing date per unique_id
# --- Audit Categorical Variables ---
def test_audit_categorical_variables_no_cat(df_no_cat):
audit, cat_df = _audit_categorical_variables(df_no_cat)
assert audit == AuditDataSeverity.PASS
assert len(cat_df) == 0
def test_audit_categorical_variables_with_cat(df_with_cat):
audit, cat_df = _audit_categorical_variables(df_with_cat)
assert audit == AuditDataSeverity.FAIL
assert cat_df.shape[1] == 1 # Should include only 'cat_col'
def test_audit_categorical_variables_with_cat_dtype(df_with_cat_dtype):
audit, cat_df = _audit_categorical_variables(df_with_cat_dtype)
assert audit == AuditDataSeverity.FAIL
assert cat_df.shape[1] == 1 # Should include only 'cat_col'
def test_audit_leading_zeros(df_leading_zeros):
audit, leading_zeros_df = _audit_leading_zeros(df_leading_zeros)
assert audit == AuditDataSeverity.CASE_SPECIFIC
assert len(leading_zeros_df) == 3
def test_audit_negative_values(df_negative_values):
audit, negative_values_df = _audit_negative_values(df_negative_values)
assert audit == AuditDataSeverity.CASE_SPECIFIC
assert len(negative_values_df) == 3
@pytest.mark.parametrize(
"date_features,freq,one_hot,expected_date_features",
[
(["year", "month"], "MS", False, ["year", "month"]),
(
[
SpecialDates(
{"first_dates": ["2021-01-1"], "second_dates": ["2021-01-01"]}
)
],
"D",
False,
["first_dates", "second_dates"],
),
(["year", "month"], "D", ["month"], ["month_" + str(i) for i in range(1, 13)]),
],
)
def test_maybe_add_date_features(
air_passengers_df, date_features, freq, one_hot, expected_date_features
):
df_copy = air_passengers_df.copy()
df_copy.rename(columns={"timestamp": "ds", "value": "y"}, inplace=True)
df_copy.insert(0, "unique_id", "AirPassengers")
df_date_features, future_df = _maybe_add_date_features(
df=df_copy,
X_df=None,
h=12,
freq=freq,
features=date_features,
one_hot=one_hot,
id_col="unique_id",
time_col="ds",
target_col="y",
)
assert all(col in df_date_features for col in expected_date_features)
assert all(col in future_df for col in expected_date_features)
@pytest.mark.parametrize(
"date_features,one_hot,expected_date_features",
[
(["year", "month"], False, ["year", "month"]),
(["month", "day"], ["month", "day"], ["month_" + str(i) for i in range(1, 13)]),
],
ids=["no_one_hot", "with_one_hot"],
)
def test_add_date_features_with_exogenous_variables(
air_passengers_df, date_features, one_hot, expected_date_features, request
):
df_copy = air_passengers_df.copy()
df_copy.rename(columns={"timestamp": "ds", "value": "y"}, inplace=True)
df_copy.insert(0, "unique_id", "AirPassengers")
df_actual_future = df_copy.tail(12)[["unique_id", "ds"]]
df_date_features, future_df = _maybe_add_date_features(
df=df_copy,
X_df=df_actual_future,
h=24,
freq="H",
features=date_features,
one_hot=one_hot,
id_col="unique_id",
time_col="ds",
target_col="y",
)
assert all(col in df_date_features for col in expected_date_features)
assert all(col in future_df for col in expected_date_features)
pd.testing.assert_frame_equal(
df_date_features[df_copy.columns],
df_copy,
)
if request.node.callspec.id == "no_one_hot":
expected_df_actual_future = df_actual_future.copy()
elif request.node.callspec.id == "with_one_hot":
expected_df_actual_future = df_actual_future.reset_index(drop=True)
pd.testing.assert_frame_equal(
future_df[df_actual_future.columns],
expected_df_actual_future,
)
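# --- Illustrative sketch (not part of the original tests) --------------------
# The "with_one_hot" expectations above (month_1 ... month_12) match a plain
# one-hot expansion of the month feature; roughly what pandas does here
# (pd is imported at the top of this file):
def _example_month_one_hot():
    ds = pd.date_range("2021-01-01", periods=24, freq="MS")
    months = pd.get_dummies(ds.month, prefix="month")  # columns month_1 ... month_12
    assert list(months.columns) == [f"month_{i}" for i in range(1, 13)]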
[build-system]
requires = ["setuptools>=36.2", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "nixtla"
dynamic = ["version"]
description = "Python SDK for Nixtla API (TimeGPT)"
authors = [
{name = "Nixtla", email = "business@nixtla.io"}
]
license = {text = "Apache Software License 2.0"}
readme = "README.md"
requires-python = ">=3.9"
keywords = ["time-series", "forecasting", "gpt"]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dependencies = [
"annotated-types",
"httpx[zstd]",
"orjson",
"pandas",
"pydantic>=1.10",
"tenacity",
"tqdm",
"utilsforecast>=0.2.8",
]
[tool.setuptools.dynamic]
version = {attr = "nixtla.__version__"}
[project.optional-dependencies]
dev = [
"black",
"datasetsforecast",
"fire",
"hierarchicalforecast",
"ipython<=8.32.0",
"ipywidgets",
"jupyterlab",
"neuralforecast",
"numpy<2",
"plotly",
"polars",
"pre-commit",
"pyreadr<0.5.3",
"python-dotenv",
"pyyaml",
"setuptools<70",
"statsforecast",
"tabulate",
"shap",
"pytest",
"pytest-cov",
"pytest-rerunfailures",
"pyarrow<21.0.0",
"mlforecast",
"lightgbm",
"utilsforecast[plotting]",
"holidays",
"pandas_market_calendars",
"pip-licenses"
]
distributed = [
"fugue[dask,ray,spark]>=0.8.7",
"dask<=2024.12.1",
"pandas<2.2",
"ray<=2.20.0",
]
plotting = [
"utilsforecast[plotting]",
]
date_extras = [
"holidays",
"pandas_market_calendars",
]
[project.urls]
Homepage = "https://github.com/Nixtla/nixtla/"
Documentation = "https://nixtlaverse.nixtla.io/"
Repository = "https://github.com/Nixtla/nixtla/"
[tool.setuptools]
include-package-data = true
[tool.setuptools.packages.find]
exclude = ["action_files*"]
[tool.ruff.lint]
select = [
"F", # pyflakes
]
[tool.pytest.ini_options]
markers = [
"distributed_run: mark test as requiring distributed run, such as those depending on Ray, Spark frameworks",
"spark_run: mark test execution related to Spark framework",
"ray_run: mark test execution related to Ray framework",
]
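# Illustrative marker selections (not part of the original configuration):
#   pytest -m "not distributed_run"   -> skip the Dask/Spark/Ray-backed tests
#   pytest -m spark_run               -> run only the Spark tests
#   pytest -m ray_run                 -> run only the Ray tests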
testpaths = ["nixtla_tests"]
addopts = [
"--cov=python/statsforecast",
"--cov-report=term-missing",
"--cov-report=html",
"--cov-fail-under=80"
]
import pandas as pd

# Collect the third-party dependencies whose license belongs to a copyleft
# family (GPL, AGPL, LGPL, MPL) into a markdown table.
df = pd.read_csv('third_party_licenses.csv')
df = df[df['License'].str.contains('GPL|AGPL|LGPL|MPL', na=False)]
# If the license field embeds the full agreement text, keep only the first line (the title).
df['License'] = df['License'].apply(lambda x: x.split('\n')[0])
df = df[~df['Name'].str.contains('quadprog')]  # ignore quadprog
# DataFrame.to_markdown requires the optional 'tabulate' dependency (listed in the dev extras).
df.to_markdown('THIRD_PARTY_LICENSES.md', index=False)