Unverified Commit df06fb1f authored by Kashif Rasul's avatar Kashif Rasul Committed by GitHub
Browse files

Time series transformer: input projection and Std scaler (#21020)



* added loc and scale outputs from scalers

* fix typo

* fix tests

* fixed formatting

* initial StdScaler

* move scaling to optional str

* calculate std feature for scalers

* undid change as it does not help

* added StdScaler with weights

* added input projection layer and d_model hyperparam

* use linear proj

* add back layernorm_embedding

* add sin-cos pos embeddings

* updated scalers

* formatting

* fix type

* fixed test

* fix repeated_past_values calculation

* fix when keepdim=false

* fix default_scale

* backward compatibility of scaling config

* update integration test expected output

* fix style

* fix docs

* use the actual num_static_real_features in feature_dim calculation

* clarified docs

* Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* prediction_length is not optional

* fix for reviewer

* Update src/transformers/models/time_series_transformer/configuration_time_series_transformer.py
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* get rid of un-needed new lines

* fix doc

* remove unneeded new lines

* fix style

* static_categorical_features and static_real_features are optional

* fix integration test

* Update src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* fixing docs for multivariate setting

* documentation for generate

---------
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent bb5a2f2f
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
""" Time Series Transformer model configuration""" """ Time Series Transformer model configuration"""
from typing import List, Optional from typing import List, Optional, Union
from ...configuration_utils import PretrainedConfig from ...configuration_utils import PretrainedConfig
from ...utils import logging from ...utils import logging
...@@ -56,8 +56,9 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -56,8 +56,9 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
input_size (`int`, *optional*, defaults to 1): input_size (`int`, *optional*, defaults to 1):
The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
multivariate targets. multivariate targets.
scaling (`bool`, *optional* defaults to `True`): scaling (`string` or `bool`, *optional* defaults to `"mean"`):
Whether to scale the input targets. Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
scaler is set to "mean".
lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`): lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`):
The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4,
5, 6, 7]`. 5, 6, 7]`.
...@@ -77,6 +78,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -77,6 +78,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
The dimension of the embedding for each of the static categorical features. Should be a list of integers, The dimension of the embedding for each of the static categorical features. Should be a list of integers,
having the same length as `num_static_categorical_features`. Cannot be `None` if having the same length as `num_static_categorical_features`. Cannot be `None` if
`num_static_categorical_features` is > 0. `num_static_categorical_features` is > 0.
d_model (`int`, *optional*, defaults to 64):
Dimensionality of the transformer layers.
encoder_layers (`int`, *optional*, defaults to 2): encoder_layers (`int`, *optional*, defaults to 2):
Number of encoder layers. Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 2): decoder_layers (`int`, *optional*, defaults to 2):
...@@ -132,13 +135,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -132,13 +135,13 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
def __init__( def __init__(
self, self,
input_size: int = 1,
prediction_length: Optional[int] = None, prediction_length: Optional[int] = None,
context_length: Optional[int] = None, context_length: Optional[int] = None,
distribution_output: str = "student_t", distribution_output: str = "student_t",
loss: str = "nll", loss: str = "nll",
input_size: int = 1,
lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7],
scaling: bool = True, scaling: Optional[Union[str, bool]] = "mean",
num_dynamic_real_features: int = 0, num_dynamic_real_features: int = 0,
num_static_categorical_features: int = 0, num_static_categorical_features: int = 0,
num_static_real_features: int = 0, num_static_real_features: int = 0,
...@@ -153,6 +156,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -153,6 +156,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
decoder_layers: int = 2, decoder_layers: int = 2,
is_encoder_decoder: bool = True, is_encoder_decoder: bool = True,
activation_function: str = "gelu", activation_function: str = "gelu",
d_model: int = 64,
dropout: float = 0.1, dropout: float = 0.1,
encoder_layerdrop: float = 0.1, encoder_layerdrop: float = 0.1,
decoder_layerdrop: float = 0.1, decoder_layerdrop: float = 0.1,
...@@ -182,7 +186,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -182,7 +186,7 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
) )
self.cardinality = cardinality self.cardinality = cardinality
else: else:
self.cardinality = [1] self.cardinality = [0]
if embedding_dimension and num_static_categorical_features > 0: if embedding_dimension and num_static_categorical_features > 0:
if len(embedding_dimension) != num_static_categorical_features: if len(embedding_dimension) != num_static_categorical_features:
raise ValueError( raise ValueError(
...@@ -194,7 +198,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -194,7 +198,8 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
self.num_parallel_samples = num_parallel_samples self.num_parallel_samples = num_parallel_samples
# Transformer architecture configuration # Transformer architecture configuration
self.d_model = input_size * len(lags_sequence) + self._number_of_features self.feature_size = input_size * len(lags_sequence) + self._number_of_features
self.d_model = d_model
self.encoder_attention_heads = encoder_attention_heads self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads self.decoder_attention_heads = decoder_attention_heads
self.encoder_ffn_dim = encoder_ffn_dim self.encoder_ffn_dim = encoder_ffn_dim
...@@ -224,6 +229,6 @@ class TimeSeriesTransformerConfig(PretrainedConfig): ...@@ -224,6 +229,6 @@ class TimeSeriesTransformerConfig(PretrainedConfig):
sum(self.embedding_dimension) sum(self.embedding_dimension)
+ self.num_dynamic_real_features + self.num_dynamic_real_features
+ self.num_time_features + self.num_time_features
+ max(1, self.num_static_real_features) # there is at least one dummy static real feature + self.num_static_real_features
+ self.input_size # the log(scale) + self.input_size * 2 # the log1p(abs(loc)) and log(scale) features
) )
...@@ -55,7 +55,7 @@ class TimeSeriesTransformerModelTester: ...@@ -55,7 +55,7 @@ class TimeSeriesTransformerModelTester:
embedding_dimension=5, embedding_dimension=5,
num_time_features=4, num_time_features=4,
is_training=True, is_training=True,
hidden_size=16, hidden_size=64,
num_hidden_layers=2, num_hidden_layers=2,
num_attention_heads=4, num_attention_heads=4,
intermediate_size=4, intermediate_size=4,
...@@ -98,6 +98,7 @@ class TimeSeriesTransformerModelTester: ...@@ -98,6 +98,7 @@ class TimeSeriesTransformerModelTester:
context_length=self.context_length, context_length=self.context_length,
lags_sequence=self.lags_sequence, lags_sequence=self.lags_sequence,
num_time_features=self.num_time_features, num_time_features=self.num_time_features,
num_static_real_features=1,
num_static_categorical_features=1, num_static_categorical_features=1,
cardinality=[self.cardinality], cardinality=[self.cardinality],
embedding_dimension=[self.embedding_dimension], embedding_dimension=[self.embedding_dimension],
...@@ -149,7 +150,7 @@ class TimeSeriesTransformerModelTester: ...@@ -149,7 +150,7 @@ class TimeSeriesTransformerModelTester:
encoder.save_pretrained(tmpdirname) encoder.save_pretrained(tmpdirname)
encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device) encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname).to(torch_device)
transformer_inputs, _, _ = model.create_network_inputs(**inputs_dict) transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict)
enc_input = transformer_inputs[:, : config.context_length, ...] enc_input = transformer_inputs[:, : config.context_length, ...]
dec_input = transformer_inputs[:, config.context_length :, ...] dec_input = transformer_inputs[:, config.context_length :, ...]
...@@ -186,13 +187,18 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -186,13 +187,18 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase):
def setUp(self): def setUp(self):
self.model_tester = TimeSeriesTransformerModelTester(self) self.model_tester = TimeSeriesTransformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=TimeSeriesTransformerConfig, has_text_modality=False) self.config_tester = ConfigTester(
self,
config_class=TimeSeriesTransformerConfig,
has_text_modality=False,
prediction_length=self.model_tester.prediction_length,
)
def test_config(self): def test_config(self):
self.config_tester.run_common_tests() self.config_tester.run_common_tests()
def test_save_load_strict(self): def test_save_load_strict(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs() config, _ = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
...@@ -303,7 +309,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -303,7 +309,7 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase):
) )
out_len = len(outputs) out_len = len(outputs)
correct_outlen = 6 correct_outlen = 7
if "last_hidden_state" in outputs: if "last_hidden_state" in outputs:
correct_outlen += 1 correct_outlen += 1
...@@ -389,13 +395,13 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): ...@@ -389,13 +395,13 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
static_real_features=batch["static_real_features"], static_real_features=batch["static_real_features"],
future_values=batch["future_values"], future_values=batch["future_values"],
future_time_features=batch["future_time_features"], future_time_features=batch["future_time_features"],
)[0] ).last_hidden_state
expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
self.assertEqual(output.shape, expected_shape) self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor( expected_slice = torch.tensor(
[[-0.3125, -1.2884, -1.1118], [-0.5801, -1.4907, -0.7782], [0.0849, -1.6557, -0.9755]], device=torch_device [[-0.6322, -1.5771, -0.9340], [-0.1011, -1.0263, -0.7208], [0.4979, -0.6487, -0.7189]], device=torch_device
) )
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
...@@ -412,12 +418,12 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): ...@@ -412,12 +418,12 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
static_categorical_features=batch["static_categorical_features"], static_categorical_features=batch["static_categorical_features"],
static_real_features=batch["static_real_features"], static_real_features=batch["static_real_features"],
future_time_features=batch["future_time_features"], future_time_features=batch["future_time_features"],
)[1] ).encoder_last_hidden_state
expected_shape = torch.Size((64, model.config.prediction_length, model.config.d_model)) expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
self.assertEqual(output.shape, expected_shape) self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor( expected_slice = torch.tensor(
[[0.9127, -0.2056, -0.5259], [1.0572, 1.4104, -0.1964], [0.1358, 2.0348, 0.5739]], device=torch_device [[0.8177, -1.7989, -0.3127], [1.6964, -1.0607, -0.1749], [1.8395, 0.1110, 0.0263]], device=torch_device
) )
self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE))
...@@ -438,6 +444,6 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): ...@@ -438,6 +444,6 @@ class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase):
expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
self.assertEqual(outputs.sequences.shape, expected_shape) self.assertEqual(outputs.sequences.shape, expected_shape)
expected_slice = torch.tensor([2289.5203, 2778.3054, 4648.1313], device=torch_device) expected_slice = torch.tensor([3883.5037, 4630.2251, 7562.1338], device=torch_device)
mean_prediction = outputs.sequences.mean(dim=1) mean_prediction = outputs.sequences.mean(dim=1)
self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) self.assertTrue(torch.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment