"docs/vscode:/vscode.git/clone" did not exist on "26b6ef79d6554a2ffc3b50ec8c68f8688bdff7a2"
Unverified Commit c8be8a9a authored by Nils Reimers, committed by GitHub

Update model configs - Allow setters for common properties (#13026)

* refactor GPT config to allow dynamic properties

* make attribute_map a class attribute

* remove old code

* update config unit tests: add a test for the common property setters

* update config unit tests: add a test for common properties passed as parameters to __init__

* update to black code format

* Allow setters to be left undefined for certain config classes

* update config classes to implement attribute_map

* bugfix lxmert config: id2label was not defined when num_labels was set

* update broken configs - add attribute_maps

* update bart config

* update black codestyle

* update documentation on common config attributes

* update GPTJ config to new attribute map

* update docs on common attributes

* gptj config: add max_position_embeddings

* gptj config: format with black

* update speech to text 2 config

* format doc file to max_len 119

* update config template
parent cf4eb8b3
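The key idea of this PR: instead of each config class defining read-only @property aliases for the common property names (hidden_size, num_attention_heads, num_hidden_layers, ...), every class now declares an attribute_map that maps a common name to its model-specific attribute, and the base class resolves both reads and writes through that map. The base-class plumbing itself is not part of the hunks shown below; a minimal sketch of how that resolution can work (the class name and method bodies here are illustrative, not the actual PretrainedConfig code) looks like this:

```python
# Illustrative sketch only: how attribute_map-based aliasing can be wired into a
# base config class. The real PretrainedConfig implementation may differ in details.
class SketchConfig:
    # Subclasses override this, e.g. {"hidden_size": "n_embd", "num_hidden_layers": "n_layer"}.
    attribute_map = {}

    def __setattr__(self, key, value):
        # Writes to a common name are redirected to the model-specific attribute.
        if key in super().__getattribute__("attribute_map"):
            key = super().__getattribute__("attribute_map")[key]
        super().__setattr__(key, value)

    def __getattribute__(self, key):
        # Reads are redirected the same way; guard against recursing on attribute_map itself.
        if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
            key = super().__getattribute__("attribute_map")[key]
        return super().__getattribute__(key)
```

Consistent with this, every config in the diff below now calls super().__init__(**kwargs) at the end of its __init__ rather than at the top, so that common-name kwargs handled by the base class (e.g. hidden_size=... passed through **kwargs) are applied after, and therefore not overwritten by, the explicit default assignments in the subclass body.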
@@ -115,6 +115,12 @@ class OpenAIGPTConfig(PretrainedConfig):
"""
model_type = "openai-gpt"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
@@ -138,8 +144,6 @@ class OpenAIGPTConfig(PretrainedConfig):
summary_first_dropout=0.1,
**kwargs
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.n_ctx = n_ctx
self.n_positions = n_positions
@@ -158,19 +162,4 @@ class OpenAIGPTConfig(PretrainedConfig):
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
super().__init__(**kwargs)
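With the map in place, the common names act as read/write aliases for the GPT-specific attributes. A quick usage example (assuming a transformers version that includes this change):

```python
from transformers import OpenAIGPTConfig

config = OpenAIGPTConfig()
# Reads resolve through attribute_map: hidden_size is an alias for n_embd.
assert config.hidden_size == config.n_embd
# Writes work too, which the removed read-only @property definitions did not allow.
config.num_hidden_layers = 6
assert config.n_layer == 6
```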
@@ -103,6 +103,7 @@ class PegasusConfig(PretrainedConfig):
"""
model_type = "pegasus"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
@@ -133,15 +134,6 @@ class PegasusConfig(PretrainedConfig):
forced_eos_token_id=1,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
@@ -163,6 +155,14 @@ class PegasusConfig(PretrainedConfig):
self.num_hidden_layers = encoder_layers
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
@property
def num_attention_heads(self) -> int:
......
@@ -97,6 +97,9 @@ class ProphetNetConfig(PretrainedConfig):
"""
model_type = "prophetnet"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_attention_heads": "num_encoder_attention_heads",
}
def __init__(
self,
@@ -129,15 +132,6 @@ class ProphetNetConfig(PretrainedConfig):
eos_token_id=2,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.encoder_ffn_dim = encoder_ffn_dim
@@ -167,10 +161,22 @@ class ProphetNetConfig(PretrainedConfig):
# 4 Training Args (should be removed soon)
self.gradient_checkpointing = gradient_checkpointing
@property
def num_attention_heads(self) -> int:
return self.num_encoder_attention_heads
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
add_cross_attention=add_cross_attention,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@property
def num_hidden_layers(self) -> int:
return self.num_encoder_layers + self.num_decoder_layers
@num_hidden_layers.setter
def num_hidden_layers(self, value):
raise NotImplementedError(
"This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and `num_decoder_layers`."
)
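For configs where a common property has no single underlying attribute, such as ProphetNet's num_hidden_layers, the setter now raises NotImplementedError instead of silently doing the wrong thing, and callers set the model-specific attributes instead. For example (again assuming a version with this change):

```python
from transformers import ProphetNetConfig

config = ProphetNetConfig()
try:
    config.num_hidden_layers = 4  # no single attribute maps to this name
except NotImplementedError:
    # Set the encoder and decoder depths explicitly instead, as the error message suggests.
    config.num_encoder_layers = 2
    config.num_decoder_layers = 2
assert config.num_hidden_layers == 4  # derived as num_encoder_layers + num_decoder_layers
```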
@@ -158,6 +158,7 @@ class ReformerConfig(PretrainedConfig):
"""
model_type = "reformer"
keys_to_ignore_at_inference = ["past_buckets_states"]
attribute_map = {}
def __init__(
self,
@@ -196,14 +197,6 @@ class ReformerConfig(PretrainedConfig):
classifier_dropout=None,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_decoder=is_decoder,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.hash_seed = hash_seed
self.vocab_size = vocab_size
self.attention_head_size = attention_head_size
@@ -234,3 +227,10 @@ class ReformerConfig(PretrainedConfig):
self.attn_layers = attn_layers
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_decoder=is_decoder,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@@ -110,6 +110,7 @@ class Speech2TextConfig(PretrainedConfig):
"""
model_type = "speech_to_text"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
@@ -146,15 +147,6 @@ class Speech2TextConfig(PretrainedConfig):
input_channels=1,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
@@ -191,10 +183,11 @@ class Speech2TextConfig(PretrainedConfig):
f"`config.num_conv_layers = {self.num_conv_layers}`."
)
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@@ -89,6 +89,7 @@ class Speech2Text2Config(PretrainedConfig):
"""
model_type = "speech_to_text_2"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
@@ -115,14 +116,6 @@ class Speech2Text2Config(PretrainedConfig):
max_target_positions=1024,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
self.vocab_size = vocab_size
self.d_model = d_model
self.decoder_ffn_dim = decoder_ffn_dim
@@ -142,10 +135,10 @@ class Speech2Text2Config(PretrainedConfig):
self.max_source_positions = max_source_positions
self.max_target_positions = max_target_positions
@property
def num_attention_heads(self) -> int:
return self.decoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
decoder_start_token_id=decoder_start_token_id,
**kwargs,
)
@@ -82,6 +82,7 @@ class T5Config(PretrainedConfig):
"""
model_type = "t5"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
def __init__(
self,
@@ -104,12 +105,6 @@ class T5Config(PretrainedConfig):
gradient_checkpointing=False,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
self.vocab_size = vocab_size
self.d_model = d_model
self.d_kv = d_kv
@@ -126,18 +121,12 @@ class T5Config(PretrainedConfig):
self.feed_forward_proj = feed_forward_proj
self.use_cache = use_cache
self.gradient_checkpointing = gradient_checkpointing
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.num_heads
@property
def num_hidden_layers(self):
return self.num_layers
super().__init__(
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
**kwargs,
)
class T5OnnxConfig(OnnxConfigWithPast):
......
@@ -106,6 +106,12 @@ class TransfoXLConfig(PretrainedConfig):
model_type = "transfo-xl"
keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size",
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
@@ -137,7 +143,6 @@ class TransfoXLConfig(PretrainedConfig):
eos_token_id=0,
**kwargs
):
super().__init__(eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
self.cutoffs.extend(cutoffs)
@@ -167,6 +172,7 @@ class TransfoXLConfig(PretrainedConfig):
self.proj_init_std = proj_init_std
self.init_std = init_std
self.layer_norm_epsilon = layer_norm_epsilon
super().__init__(eos_token_id=eos_token_id, **kwargs)
@property
def max_position_embeddings(self):
@@ -174,22 +180,9 @@
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@property
def n_token(self): # Backward compatibility
return self.vocab_size
@n_token.setter
def n_token(self, value): # Backward compatibility
self.vocab_size = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
@max_position_embeddings.setter
def max_position_embeddings(self, value):
# Message copied from Transformer-XL documentation
raise NotImplementedError(
f"The model {self.model_type} is one of the few models that has no sequence length limit."
)
@@ -146,6 +146,12 @@ class XLMConfig(PretrainedConfig):
"""
model_type = "xlm"
attribute_map = {
"hidden_size": "emb_dim",
"num_attention_heads": "n_heads",
"num_hidden_layers": "n_layers",
"n_words": "vocab_size", # For backward compatibility
}
def __init__(
self,
@@ -185,7 +191,6 @@ class XLMConfig(PretrainedConfig):
**kwargs
):
"""Constructs XLMConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.n_layers = n_layers
@@ -221,22 +226,4 @@ class XLMConfig(PretrainedConfig):
if "n_words" in kwargs:
self.n_words = kwargs["n_words"]
@property
def n_words(self): # For backward compatibility
return self.vocab_size
@n_words.setter
def n_words(self, value): # For backward compatibility
self.vocab_size = value
@property
def hidden_size(self):
return self.emb_dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
@@ -137,6 +137,12 @@ class XLNetConfig(PretrainedConfig):
model_type = "xlnet"
keys_to_ignore_at_inference = ["mems"]
attribute_map = {
"n_token": "vocab_size", # Backward compatibility
"hidden_size": "d_model",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
@@ -170,7 +176,6 @@ class XLNetConfig(PretrainedConfig):
**kwargs
):
"""Constructs XLNetConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layer = n_layer
@@ -216,27 +221,16 @@ class XLNetConfig(PretrainedConfig):
self.use_mems_eval = use_mems_eval
self.use_mems_train = use_mems_train
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def max_position_embeddings(self):
logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
return -1
@property
def n_token(self): # Backward compatibility
return self.vocab_size
@n_token.setter
def n_token(self, value): # Backward compatibility
self.vocab_size = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
@max_position_embeddings.setter
def max_position_embeddings(self, value):
# Message copied from Transformer-XL documentation
raise NotImplementedError(
f"The model {self.model_type} is one of the few models that has no sequence length limit."
)
@@ -137,6 +137,15 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
{% else -%}
keys_to_ignore_at_inference = ["past_key_values"]
{% endif -%}
{% if cookiecutter.is_encoder_decoder_model == "False" %}
{%- else %}
attribute_map = {
"num_attention_heads": "encoder_attention_heads",
"hidden_size": "d_model"
}
{%- endif %}
def __init__(
self,
@@ -184,18 +193,6 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
eos_token_id=2,
**kwargs
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
{% endif -%}
**kwargs
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
@@ -232,14 +229,16 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
{% endif -%}
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
{% endif -%}
**kwargs
)
{% if cookiecutter.is_encoder_decoder_model == "False" %}
{%- else %}
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
{%- endif %}
\ No newline at end of file
@@ -34,11 +34,39 @@ class ConfigTester(object):
def create_and_test_config_common_properties(self):
config = self.config_class(**self.inputs_dict)
common_properties = ["hidden_size", "num_attention_heads", "num_hidden_layers"]
# Add common fields for text models
if self.has_text_modality:
self.parent.assertTrue(hasattr(config, "vocab_size"))
self.parent.assertTrue(hasattr(config, "hidden_size"))
self.parent.assertTrue(hasattr(config, "num_attention_heads"))
self.parent.assertTrue(hasattr(config, "num_hidden_layers"))
common_properties.extend(["vocab_size"])
# Test that config has the common properties as getters
for prop in common_properties:
self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist")
# Test that config has the common properties as setters
for idx, name in enumerate(common_properties):
try:
setattr(config, name, idx)
self.parent.assertEqual(
getattr(config, name), idx, msg=f"`{name}` value {idx} expected, but was {getattr(config, name)}"
)
except NotImplementedError:
# Some models might not be able to implement setters for common_properties
# In that case, a NotImplementedError is raised
pass
# Test if config class can be called with Config(prop_name=..)
for idx, name in enumerate(common_properties):
try:
config = self.config_class(**{name: idx})
self.parent.assertEqual(
getattr(config, name), idx, msg=f"`{name}` value {idx} expected, but was {getattr(config, name)}"
)
except NotImplementedError:
# Some models might not be able to implement setters for common_properties
# In that case, a NotImplementedError is raised
pass
def create_and_test_config_to_json_string(self):
config = self.config_class(**self.inputs_dict)
......
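A model's test file typically exercises these checks through a ConfigTester instance. A rough usage sketch follows; BertConfig and the keyword overrides are illustrative stand-ins, and the import path is an assumption (ConfigTester ships with the repository's test suite, not the installed package):

```python
import unittest

from transformers import BertConfig

# Assumption: run from the repository's tests/ directory, where
# test_configuration_common.py defines ConfigTester.
from test_configuration_common import ConfigTester


class BertConfigTest(unittest.TestCase):
    def test_common_properties(self):
        # ConfigTester needs a `parent` providing assertTrue/assertEqual, so the
        # TestCase passes itself; extra kwargs become the config inputs_dict.
        tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
        tester.create_and_test_config_common_properties()
        tester.create_and_test_config_to_json_string()


if __name__ == "__main__":
    unittest.main()
```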