"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "63caa370e6c618dbe7d3fd4cbf545cc32eca1a15"
Unverified Commit c8be8a9a authored by Nils Reimers, committed by GitHub

Update model configs - Allow setters for common properties (#13026)

* refactor GPT Config to allow dyn. properties

* make attribute_map a class attribute

* remove old code

* update unit test to test config: Add test for common properties setter

* update unit test to test config: Add test for common properties passed as parameters to __init__

* update to black code format

* Allow that setters are not defined for certain config classes

* update config classes to implement attribute_map

* bugfix lxmert config - id2labels was not defined when num_labels was set

* update broken configs - add attribute_maps

* update bart config

* update black codestyle

* update documentation on common config attributes

* update GPTJ config to new attribute map

* update docs on common attributes

* gptj config: add max_position_embeddings

* gptj config: format with black

* update speech to text 2 config

* format doc file to max_len 119

* update config template
parent cf4eb8b3
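
The diff below replaces the read-only `hidden_size`/`num_attention_heads`-style properties in the individual config classes with a class-level `attribute_map`, so the standardized attribute names become settable as well as readable. A minimal sketch of the resulting behavior (illustrative only, using the GPT-2 mapping added in this commit):

    from transformers import GPT2Config

    config = GPT2Config()
    config.hidden_size = 1280          # previously a read-only property; now forwarded to n_embd
    config.num_attention_heads = 20    # forwarded to n_head via attribute_map
    assert config.n_embd == 1280 and config.n_head == 20
    assert config.hidden_size == 1280  # reads are forwarded the same way
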
@@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
 either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
 from HuggingFace's AWS S3 repository).
 
+Each derived config class implements model specific attributes. Common attributes present in all config classes are:
+:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
+:obj:`vocab_size`.
+
 
 PretrainedConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
@@ -57,6 +57,8 @@ class PretrainedConfig(PushToHubMixin):
           :class:`~RagConfig`.
         - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at
           dictionary outputs of the model during inference.
+        - **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model specific attribute names to the
+          standardized naming of attributes.
 
     Common attributes (present in all subclasses)
@@ -218,6 +220,17 @@ class PretrainedConfig(PushToHubMixin):
     """
     model_type: str = ""
     is_composition: bool = False
+    attribute_map: Dict[str, str] = {}
+
+    def __setattr__(self, key, value):
+        if key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        super().__setattr__(key, value)
+
+    def __getattribute__(self, key):
+        if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
+            key = super().__getattribute__("attribute_map")[key]
+        return super().__getattribute__(key)
 
     def __init__(self, **kwargs):
         # Attributes with defaults
@@ -350,7 +363,7 @@ class PretrainedConfig(PushToHubMixin):
     @num_labels.setter
     def num_labels(self, num_labels: int):
-        if self.id2label is None or len(self.id2label) != num_labels:
+        if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels:
             self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
             self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
......
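
The `__setattr__`/`__getattribute__` overrides above are what make the mapping transparent. A standalone sketch of the mechanism, using a hypothetical `ToyConfig` class that is not part of the commit:

    class ToyConfig:
        # maps standardized attribute names to model specific ones
        attribute_map = {"hidden_size": "d_model"}

        def __setattr__(self, key, value):
            if key in super().__getattribute__("attribute_map"):
                key = super().__getattribute__("attribute_map")[key]
            super().__setattr__(key, value)

        def __getattribute__(self, key):
            if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
                key = super().__getattribute__("attribute_map")[key]
            return super().__getattribute__(key)

    config = ToyConfig()
    config.hidden_size = 1024      # stored as d_model
    assert config.d_model == 1024
    assert config.hidden_size == 1024

In the per-model diffs below, the `super().__init__(**kwargs)` call is also moved to the end of each subclass `__init__`, so that common attributes passed as keyword arguments (and routed through the mapped setters by the base class) take effect instead of being overwritten by the subclass defaults afterwards.
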
@@ -109,6 +109,7 @@ class BartConfig(PretrainedConfig):
     """
     model_type = "bart"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -141,17 +142,6 @@ class BartConfig(PretrainedConfig):
         forced_eos_token_id=2,
         **kwargs
     ):
-        super().__init__(
-            num_labels=num_labels,
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -174,6 +164,17 @@ class BartConfig(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        super().__init__(
+            num_labels=num_labels,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
 
         # ensure backward compatibility for BART CNN models
         if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
             self.forced_bos_token_id = self.bos_token_id
@@ -182,14 +183,6 @@ class BartConfig(PretrainedConfig):
                 "The config can simply be saved and uploaded again to be fixed."
             )
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
-
 
 class BartOnnxConfig(OnnxConfigWithPast):
     @property
......
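
For encoder-decoder configs such as BART, the mapping resolves the common names to the encoder-side/model-wide values. An illustrative check (not part of the commit):

    from transformers import BartConfig

    config = BartConfig(encoder_attention_heads=8, d_model=512)
    assert config.num_attention_heads == 8   # -> encoder_attention_heads
    assert config.hidden_size == 512         # -> d_model

    config.hidden_size = 768                 # now writable; updates d_model in place
    assert config.d_model == 768
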
@@ -112,6 +112,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
     """
     model_type = "bigbird_pegasus"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_attention_heads": "encoder_attention_heads",
+        "hidden_size": "d_model",
+        "attention_probs_dropout_prob": "attention_dropout",
+    }
 
     def __init__(
         self,
@@ -146,15 +151,6 @@ class BigBirdPegasusConfig(PretrainedConfig):
         use_bias=False,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -183,14 +179,11 @@ class BigBirdPegasusConfig(PretrainedConfig):
         self.num_random_blocks = num_random_blocks
         self.use_bias = use_bias
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
-
-    @property
-    def attention_probs_dropout_prob(self) -> float:
-        return self.attention_dropout
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
@@ -103,6 +103,7 @@ class BlenderbotConfig(PretrainedConfig):
     """
     model_type = "blenderbot"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -135,17 +136,6 @@ class BlenderbotConfig(PretrainedConfig):
         forced_eos_token_id=2,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -168,10 +158,13 @@ class BlenderbotConfig(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
@@ -103,6 +103,7 @@ class BlenderbotSmallConfig(PretrainedConfig):
     """
     model_type = "blenderbot-small"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -134,16 +135,6 @@ class BlenderbotSmallConfig(PretrainedConfig):
         forced_eos_token_id=2,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -166,10 +157,12 @@ class BlenderbotSmallConfig(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
@@ -81,6 +81,12 @@ class CTRLConfig(PretrainedConfig):
 
     model_type = "ctrl"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
 
     def __init__(
         self,
@@ -104,7 +110,6 @@ class CTRLConfig(PretrainedConfig):
         use_cache=True,
         **kwargs
     ):
-        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
@@ -125,18 +130,4 @@ class CTRLConfig(PretrainedConfig):
         self.summary_proj_to_labels = summary_proj_to_labels
         self.use_cache = use_cache
 
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
+        super().__init__(**kwargs)
@@ -117,6 +117,10 @@ class DetrConfig(PretrainedConfig):
     """
     model_type = "detr"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
 
     def __init__(
         self,
@@ -154,8 +158,6 @@ class DetrConfig(PretrainedConfig):
         eos_coefficient=0.1,
         **kwargs
     ):
-        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
-
         self.num_queries = num_queries
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -189,6 +191,7 @@ class DetrConfig(PretrainedConfig):
         self.bbox_loss_coefficient = bbox_loss_coefficient
         self.giou_loss_coefficient = giou_loss_coefficient
         self.eos_coefficient = eos_coefficient
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
     @property
     def num_attention_heads(self) -> int:
......
@@ -93,6 +93,11 @@ class DistilBertConfig(PretrainedConfig):
         >>> configuration = model.config
     """
     model_type = "distilbert"
+    attribute_map = {
+        "hidden_size": "dim",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+    }
 
     def __init__(
         self,
@@ -112,7 +117,6 @@ class DistilBertConfig(PretrainedConfig):
         pad_token_id=0,
         **kwargs
     ):
-        super().__init__(**kwargs, pad_token_id=pad_token_id)
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.sinusoidal_pos_embds = sinusoidal_pos_embds
@@ -126,18 +130,7 @@ class DistilBertConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.qa_dropout = qa_dropout
         self.seq_classif_dropout = seq_classif_dropout
-
-    @property
-    def hidden_size(self):
-        return self.dim
-
-    @property
-    def num_attention_heads(self):
-        return self.n_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layers
+        super().__init__(**kwargs, pad_token_id=pad_token_id)
 
 
 class DistilBertOnnxConfig(OnnxConfig):
......
@@ -136,6 +136,6 @@ class FlaubertConfig(XLMConfig):
 
     def __init__(self, layerdrop=0.0, pre_norm=False, pad_token_id=2, bos_token_id=0, **kwargs):
         """Constructs FlaubertConfig."""
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
         self.layerdrop = layerdrop
         self.pre_norm = pre_norm
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
@@ -124,6 +124,7 @@ class FSMTConfig(PretrainedConfig):
     """
     model_type = "fsmt"
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     # update the defaults from config file
     def __init__(
@@ -161,18 +162,6 @@ class FSMTConfig(PretrainedConfig):
         forced_eos_token_id=2,
         **common_kwargs
     ):
-        if "hidden_size" in common_kwargs:
-            raise ValueError("hidden size is called d_model")
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            decoder_start_token_id=decoder_start_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            tie_word_embeddings=tie_word_embeddings,
-            forced_eos_token_id=forced_eos_token_id,
-            **common_kwargs,
-        )
         self.langs = langs
         self.src_vocab_size = src_vocab_size
         self.tgt_vocab_size = tgt_vocab_size
@@ -196,6 +185,8 @@ class FSMTConfig(PretrainedConfig):
         self.early_stopping = early_stopping
 
         self.decoder = DecoderConfig(vocab_size=tgt_vocab_size, bos_token_id=eos_token_id)
+        if "decoder" in common_kwargs:
+            del common_kwargs["decoder"]
 
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
@@ -205,14 +196,16 @@ class FSMTConfig(PretrainedConfig):
         self.dropout = dropout
         self.use_cache = use_cache
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            tie_word_embeddings=tie_word_embeddings,
+            forced_eos_token_id=forced_eos_token_id,
+            **common_kwargs,
+        )
 
     def to_dict(self):
         """
......
@@ -102,6 +102,10 @@ class FunnelConfig(PretrainedConfig):
             Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
     """
     model_type = "funnel"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "n_head",
+    }
 
     def __init__(
         self,
@@ -129,8 +133,6 @@ class FunnelConfig(PretrainedConfig):
         pool_q_only=True,
         **kwargs
     ):
-        super().__init__(**kwargs)
-
         self.vocab_size = vocab_size
         self.block_sizes = block_sizes
         self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
@@ -165,18 +167,22 @@ class FunnelConfig(PretrainedConfig):
         self.truncate_seq = truncate_seq
         self.pool_q_only = pool_q_only
 
-    @property
-    def hidden_size(self):
-        return self.d_model
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
+        super().__init__(**kwargs)
 
     @property
     def num_hidden_layers(self):
         return sum(self.block_sizes)
 
+    @num_hidden_layers.setter
+    def num_hidden_layers(self, value):
+        raise NotImplementedError(
+            "This model does not support the setting of `num_hidden_layers`. Please set `block_sizes`."
+        )
+
     @property
     def num_blocks(self):
         return len(self.block_sizes)
 
+    @num_blocks.setter
+    def num_blocks(self, value):
+        raise NotImplementedError("This model does not support the setting of `num_blocks`. Please set `block_sizes`.")
@@ -130,6 +130,12 @@ class GPT2Config(PretrainedConfig):
 
     model_type = "gpt2"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
 
     def __init__(
         self,
@@ -158,8 +164,6 @@ class GPT2Config(PretrainedConfig):
         eos_token_id=50256,
         **kwargs
     ):
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
@@ -185,21 +189,7 @@ class GPT2Config(PretrainedConfig):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
 
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
 
 class GPT2OnnxConfig(OnnxConfigWithPast):
......
@@ -96,6 +96,7 @@ class GPTNeoConfig(PretrainedConfig):
         >>> configuration = model.config
     """
     model_type = "gpt_neo"
+    attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
 
     def __init__(
         self,
@@ -124,8 +125,6 @@ class GPTNeoConfig(PretrainedConfig):
         eos_token_id=50256,
         **kwargs
     ):
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -163,6 +162,8 @@ class GPTNeoConfig(PretrainedConfig):
                 "Please verify the value of `config.attention_types` argument."
             )
 
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
     @staticmethod
     def expand_attention_types_params(attention_types):
         attentions = []
@@ -171,14 +172,6 @@ class GPTNeoConfig(PretrainedConfig):
                 attentions.extend(item[0])
         return attentions
 
-    @property
-    def num_attention_heads(self):
-        return self.num_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.num_layers
-
 
 def custom_unfold(input, dimension, size, step):
     """Custom torch.Tensor.unfold implementation to enable the export to ONNX."""
......
@@ -87,6 +87,12 @@ class GPTJConfig(PretrainedConfig):
         >>> configuration = model.config
     """
     model_type = "gptj"
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
 
     def __init__(
         self,
@@ -111,8 +117,6 @@ class GPTJConfig(PretrainedConfig):
         eos_token_id=50256,
         **kwargs
     ):
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
@@ -134,18 +138,4 @@ class GPTJConfig(PretrainedConfig):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
 
-    @property
-    def max_position_embeddings(self):
-        return self.n_positions
-
-    @property
-    def hidden_size(self):
-        return self.n_embd
-
-    @property
-    def num_attention_heads(self):
-        return self.n_head
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layer
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -99,6 +99,12 @@ class LEDConfig(PretrainedConfig):
         >>> configuration = model.config
     """
     model_type = "led"
+    attribute_map = {
+        "num_attention_heads": "encoder_attention_heads",
+        "hidden_size": "d_model",
+        "attention_probs_dropout_prob": "attention_dropout",
+        "initializer_range": "init_std",
+    }
 
     def __init__(
         self,
@@ -130,15 +136,6 @@ class LEDConfig(PretrainedConfig):
         attention_window: Union[List[int], int] = 512,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_encoder_position_embeddings = max_encoder_position_embeddings
         self.max_decoder_position_embeddings = max_decoder_position_embeddings
@@ -162,18 +159,11 @@ class LEDConfig(PretrainedConfig):
         self.attention_window = attention_window
         self.gradient_checkpointing = gradient_checkpointing
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
-
-    @property
-    def attention_probs_dropout_prob(self) -> float:
-        return self.attention_dropout
-
-    @property
-    def initializer_range(self) -> float:
-        return self.init_std
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
@@ -113,13 +113,13 @@ class LxmertConfig(PretrainedConfig):
     """
 
     model_type = "lxmert"
+    attribute_map = {}
 
     def __init__(
         self,
         vocab_size=30522,
         hidden_size=768,
         num_attention_heads=12,
-        num_labels=2,
         num_qa_labels=9500,
         num_object_labels=1600,
         num_attr_labels=400,
@@ -149,11 +149,9 @@ class LxmertConfig(PretrainedConfig):
         output_hidden_states=False,
         **kwargs,
     ):
-        super().__init__(**kwargs)
-
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_attention_heads = num_attention_heads
-        self.num_labels = num_labels
         self.hidden_act = hidden_act
         self.intermediate_size = intermediate_size
         self.hidden_dropout_prob = hidden_dropout_prob
@@ -179,5 +177,6 @@ class LxmertConfig(PretrainedConfig):
         self.visual_attr_loss = visual_attr_loss
         self.visual_feat_loss = visual_feat_loss
         self.output_hidden_states = output_hidden_states
-        self.output_attentions = self.output_attentions
+        self.output_attentions = output_attentions
         self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers}
+        super().__init__(**kwargs)
@@ -97,6 +97,7 @@ class M2M100Config(PretrainedConfig):
     """
     model_type = "m2m_100"
    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -126,15 +127,6 @@ class M2M100Config(PretrainedConfig):
         eos_token_id=2,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -156,10 +148,11 @@ class M2M100Config(PretrainedConfig):
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
@@ -103,6 +103,7 @@ class MarianConfig(PretrainedConfig):
     """
     model_type = "marian"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -133,15 +134,6 @@ class MarianConfig(PretrainedConfig):
         forced_eos_token_id=0,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -163,11 +155,11 @@ class MarianConfig(PretrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
@@ -107,6 +107,7 @@ class MBartConfig(PretrainedConfig):
     """
     model_type = "mbart"
     keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
     def __init__(
         self,
@@ -137,15 +138,6 @@ class MBartConfig(PretrainedConfig):
         forced_eos_token_id=2,
         **kwargs
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.d_model = d_model
@@ -167,14 +159,14 @@ class MBartConfig(PretrainedConfig):
         self.num_hidden_layers = encoder_layers
         self.gradient_checkpointing = gradient_checkpointing
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
 
-    @property
-    def num_attention_heads(self) -> int:
-        return self.encoder_attention_heads
-
-    @property
-    def hidden_size(self) -> int:
-        return self.d_model
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
 
 
 class MBartOnnxConfig(OnnxConfigWithPast):
......