Commit c7be096c authored by thomwolf's avatar thomwolf
Browse files

Merge branch 'master' into cli

parents 3492a6ec 33adab2b
...@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer):
Args: Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
......
__version__ = "2.2.1" __version__ = "2.2.2"
# Work around to update TensorFlow's absl.logging threshold which alters the # Work around to update TensorFlow's absl.logging threshold which alters the
# default Python logging output behavior when present. # default Python logging output behavior when present.
...@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name ...@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
# Files and general utilities # Files and general utilities
from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
cached_path, add_start_docstrings, add_end_docstrings, cached_path, add_start_docstrings, add_end_docstrings,
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
is_tf_available, is_torch_available) is_tf_available, is_torch_available)
from .data import (is_sklearn_available, from .data import (is_sklearn_available,
...@@ -34,10 +34,14 @@ from .data import (is_sklearn_available, ...@@ -34,10 +34,14 @@ from .data import (is_sklearn_available,
if is_sklearn_available(): if is_sklearn_available():
from .data import glue_compute_metrics, xnli_compute_metrics from .data import glue_compute_metrics, xnli_compute_metrics
# Model Cards
from .model_card import ModelCard
# Tokenizers # Tokenizers
from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_utils import (PreTrainedTokenizer)
from .tokenization_auto import AutoTokenizer from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_gpt2 import GPT2Tokenizer
...@@ -48,28 +52,29 @@ from .tokenization_roberta import RobertaTokenizer ...@@ -48,28 +52,29 @@ from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_albert import AlbertTokenizer from .tokenization_albert import AlbertTokenizer
from .tokenization_camembert import CamembertTokenizer from .tokenization_camembert import CamembertTokenizer
from .tokenization_t5 import T5Tokenizer
# Configurations # Configurations
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .configuration_auto import AutoConfig from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
# Modeling # Modeling
if is_torch_available(): if is_torch_available():
from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
AutoModelWithLMHead, AutoModelForTokenClassification) AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction, BertForMaskedLM, BertForNextSentencePrediction,
...@@ -77,8 +82,8 @@ if is_torch_available(): ...@@ -77,8 +82,8 @@ if is_torch_available():
BertForTokenClassification, BertForQuestionAnswering, BertForTokenClassification, BertForQuestionAnswering,
load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
AdaptiveEmbedding, AdaptiveEmbedding,
load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
...@@ -110,6 +115,9 @@ if is_torch_available(): ...@@ -110,6 +115,9 @@ if is_torch_available():
CamembertForTokenClassification, CamembertForTokenClassification,
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
load_tf_weights_in_t5,
T5_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
AlbertForQuestionAnswering, AlbertForQuestionAnswering,
...@@ -124,7 +132,7 @@ if is_torch_available(): ...@@ -124,7 +132,7 @@ if is_torch_available():
if is_tf_available(): if is_tf_available():
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
TFAutoModelWithLMHead, TFAutoModelForTokenClassification) TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
TFBertModel, TFBertForPreTraining, TFBertModel, TFBertForPreTraining,
...@@ -178,6 +186,10 @@ if is_tf_available(): ...@@ -178,6 +186,10 @@ if is_tf_available():
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM, from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
TFAlbertForSequenceClassification, TFAlbertForSequenceClassification,
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
# Optimization # Optimization
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator) from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
......
...@@ -19,6 +19,7 @@ def main(): ...@@ -19,6 +19,7 @@ def main():
# parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]') # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
# commands_parser = parser.add_subparsers(help='transformers-cli command helpers') # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
# # Register commands # # Register commands
# ServeCommand.register_subcommand(commands_parser) # ServeCommand.register_subcommand(commands_parser)
......
...@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand): ...@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand):
list_parser.set_defaults(func=lambda args: ListObjsCommand(args)) list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
# upload # upload
upload_parser = parser.add_parser('upload') upload_parser = parser.add_parser('upload')
upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.') upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.') upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
upload_parser.set_defaults(func=lambda args: UploadCommand(args)) upload_parser.set_defaults(func=lambda args: UploadCommand(args))
...@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand): ...@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand):
class UploadCommand(BaseUserCommand): class UploadCommand(BaseUserCommand):
def walk_dir(self, rel_path):
"""
Recursively list all files in a folder.
"""
entries: List[os.DirEntry] = list(os.scandir(rel_path))
files = [
(
os.path.join(os.getcwd(), f.path), # filepath
f.path # filename
)
for f in entries if f.is_file()
]
for f in entries:
if f.is_dir():
files += self.walk_dir(f.path)
return files
def run(self): def run(self):
token = HfFolder.get_token() token = HfFolder.get_token()
if token is None: if token is None:
print("Not logged in") print("Not logged in")
exit(1) exit(1)
filepath = os.path.join(os.getcwd(), self.args.file) local_path = os.path.abspath(self.args.path)
filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath) if os.path.isdir(local_path):
print( if self.args.filename is not None:
"About to upload file {} to S3 under filename {}".format( raise ValueError("Cannot specify a filename override when uploading a folder.")
ANSI.bold(filepath), ANSI.bold(filename) rel_path = os.path.basename(local_path)
files = self.walk_dir(rel_path)
elif os.path.isfile(local_path):
filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
files = [(local_path, filename)]
else:
raise ValueError("Not a valid file or directory: {}".format(local_path))
for filepath, filename in files:
print(
"About to upload file {} to S3 under filename {}".format(
ANSI.bold(filepath), ANSI.bold(filename)
)
) )
)
choice = input("Proceed? [Y/n] ").lower() choice = input("Proceed? [Y/n] ").lower()
if not(choice == "" or choice == "y" or choice == "yes"): if not(choice == "" or choice == "y" or choice == "yes"):
print("Abort") print("Abort")
exit() exit()
print( print(
ANSI.bold("Uploading... This might take a while if file is large") ANSI.bold("Uploading... This might take a while if files are large")
) )
access_url = self._api.presign_and_upload( for filepath, filename in files:
token=token, filename=filename, filepath=filepath access_url = self._api.presign_and_upload(
) token=token, filename=filename, filepath=filepath
print("Your file now lives at:") )
print(access_url) print("Your file now lives at:")
print(access_url)
...@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig): ...@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=30000, vocab_size=30000,
embedding_size=128, embedding_size=128,
hidden_size=4096, hidden_size=4096,
num_hidden_layers=12, num_hidden_layers=12,
...@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig): ...@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
""" """
super(AlbertConfig, self).__init__(**kwargs) super(AlbertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size_or_config_json_file self.vocab_size = vocab_size
self.embedding_size = embedding_size self.embedding_size = embedding_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
...@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig): ...@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig):
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
\ No newline at end of file
...@@ -18,21 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera ...@@ -18,21 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging import logging
from .configuration_bert import BertConfig from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_openai import OpenAIGPTConfig from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_gpt2 import GPT2Config from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_transfo_xl import TransfoXLConfig from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlnet import XLNetConfig from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlm import XLMConfig from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_roberta import RobertaConfig from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_distilbert import DistilBertConfig from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_ctrl import CTRLConfig from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_camembert import CamembertConfig from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_albert import AlbertConfig from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
for pretrained_map in [
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
]
for key, value, in pretrained_map.items())
class AutoConfig(object): class AutoConfig(object):
r""":class:`~transformers.AutoConfig` is a generic configuration class r""":class:`~transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library that will be instantiated as one of the configuration classes of the library
...@@ -96,6 +115,7 @@ class AutoConfig(object): ...@@ -96,6 +115,7 @@ class AutoConfig(object):
The configuration class to instantiate is selected as the first pattern matching The configuration class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Config (T5 model)
- contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `distilbert`: DistilBertConfig (DistilBERT model)
- contains `albert`: AlbertConfig (ALBERT model) - contains `albert`: AlbertConfig (ALBERT model)
- contains `camembert`: CamembertConfig (CamemBERT model) - contains `camembert`: CamembertConfig (CamemBERT model)
...@@ -111,6 +131,7 @@ class AutoConfig(object): ...@@ -111,6 +131,7 @@ class AutoConfig(object):
pretrained_model_name_or_path: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
...@@ -151,7 +172,9 @@ class AutoConfig(object): ...@@ -151,7 +172,9 @@ class AutoConfig(object):
assert unused_kwargs == {'foo': False} assert unused_kwargs == {'foo': False}
""" """
if 'distilbert' in pretrained_model_name_or_path: if 't5' in pretrained_model_name_or_path:
return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'albert' in pretrained_model_name_or_path: elif 'albert' in pretrained_model_name_or_path:
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
......
...@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
} }
...@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig): ...@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig):
Arguments: Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer. hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder. num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in num_attention_heads: Number of attention heads for each attention layer in
...@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig): ...@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig):
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=30522, vocab_size=30522,
hidden_size=768, hidden_size=768,
num_hidden_layers=12, num_hidden_layers=12,
num_attention_heads=12, num_attention_heads=12,
...@@ -91,25 +97,15 @@ class BertConfig(PretrainedConfig): ...@@ -91,25 +97,15 @@ class BertConfig(PretrainedConfig):
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
**kwargs): **kwargs):
super(BertConfig, self).__init__(**kwargs) super(BertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 self.vocab_size = vocab_size
and isinstance(vocab_size_or_config_json_file, unicode)): self.hidden_size = hidden_size
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: self.num_hidden_layers = num_hidden_layers
json_config = json.loads(reader.read()) self.num_attention_heads = num_attention_heads
for key, value in json_config.items(): self.hidden_act = hidden_act
self.__dict__[key] = value self.intermediate_size = intermediate_size
elif isinstance(vocab_size_or_config_json_file, int): self.hidden_dropout_prob = hidden_dropout_prob
self.vocab_size = vocab_size_or_config_json_file self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.hidden_size = hidden_size self.max_position_embeddings = max_position_embeddings
self.num_hidden_layers = num_hidden_layers self.type_vocab_size = type_vocab_size
self.num_attention_heads = num_attention_heads self.initializer_range = initializer_range
self.hidden_act = hidden_act self.layer_norm_eps = layer_norm_eps
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
...@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig): ...@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `CTRLModel`. """Configuration class to store the configuration of a `CTRLModel`.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings. n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions). n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN. dff: Size of the inner dimension of the FFN.
...@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig): ...@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
def __init__( def __init__(
self, self,
vocab_size_or_config_json_file=246534, vocab_size=246534,
n_positions=256, n_positions=256,
n_ctx=256, n_ctx=256,
n_embd=1280, n_embd=1280,
...@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig): ...@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
attn_pdrop=0.1, attn_pdrop=0.1,
layer_norm_epsilon=1e-6, layer_norm_epsilon=1e-6,
initializer_range=0.02, initializer_range=0.02,
num_labels=1,
summary_type='cls_index', summary_type='cls_index',
summary_use_proj=True, summary_use_proj=True,
summary_activation=None, summary_activation=None,
...@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig): ...@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
"""Constructs CTRLConfig. """Constructs CTRLConfig.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
n_positions: Number of positional embeddings. n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions). n_ctx: Size of the causal mask (usually same as n_positions).
dff: Size of the inner dimension of the FFN. dff: Size of the inner dimension of the FFN.
...@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig): ...@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
initializing all weight matrices. initializing all weight matrices.
""" """
super(CTRLConfig, self).__init__(**kwargs) super(CTRLConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.n_positions = n_positions self.n_positions = n_positions
self.n_embd = n_embd self.n_embd = n_embd
...@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig): ...@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
self.layer_norm_epsilon = layer_norm_epsilon self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.num_labels = num_labels
self.summary_type = summary_type self.summary_type = summary_type
self.summary_use_proj = summary_use_proj self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels self.summary_proj_to_labels = summary_proj_to_labels
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif not isinstance(vocab_size_or_config_json_file, int):
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
......
...@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig): ...@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=30522, vocab_size=30522,
max_position_embeddings=512, max_position_embeddings=512,
sinusoidal_pos_embds=False, sinusoidal_pos_embds=False,
n_layers=6, n_layers=6,
...@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig): ...@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
seq_classif_dropout=0.2, seq_classif_dropout=0.2,
**kwargs): **kwargs):
super(DistilBertConfig, self).__init__(**kwargs) super(DistilBertConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property @property
def hidden_size(self): def hidden_size(self):
return self.dim return self.dim
......
...@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig): ...@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`. """Configuration class to store the configuration of a `GPT2Model`.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings. n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions). n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states. n_embd: Dimensionality of the embeddings and hidden states.
...@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig): ...@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
def __init__( def __init__(
self, self,
vocab_size_or_config_json_file=50257, vocab_size=50257,
n_positions=1024, n_positions=1024,
n_ctx=1024, n_ctx=1024,
n_embd=768, n_embd=768,
...@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig): ...@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
attn_pdrop=0.1, attn_pdrop=0.1,
layer_norm_epsilon=1e-5, layer_norm_epsilon=1e-5,
initializer_range=0.02, initializer_range=0.02,
num_labels=1,
summary_type='cls_index', summary_type='cls_index',
summary_use_proj=True, summary_use_proj=True,
summary_activation=None, summary_activation=None,
...@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig): ...@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
"""Constructs GPT2Config. """Constructs GPT2Config.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings. n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions). n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states. n_embd: Dimensionality of the embeddings and hidden states.
...@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig): ...@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
initializing all weight matrices. initializing all weight matrices.
""" """
super(GPT2Config, self).__init__(**kwargs) super(GPT2Config, self).__init__(**kwargs)
self.vocab_size = vocab_size
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 self.n_ctx = n_ctx
and isinstance(vocab_size_or_config_json_file, unicode)): self.n_positions = n_positions
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: self.n_embd = n_embd
json_config = json.loads(reader.read()) self.n_layer = n_layer
for key, value in json_config.items(): self.n_head = n_head
self.__dict__[key] = value self.resid_pdrop = resid_pdrop
elif isinstance(vocab_size_or_config_json_file, int): self.embd_pdrop = embd_pdrop
self.vocab_size = vocab_size_or_config_json_file self.attn_pdrop = attn_pdrop
self.n_ctx = n_ctx self.layer_norm_epsilon = layer_norm_epsilon
self.n_positions = n_positions self.initializer_range = initializer_range
self.n_embd = n_embd self.summary_type = summary_type
self.n_layer = n_layer self.summary_use_proj = summary_use_proj
self.n_head = n_head self.summary_activation = summary_activation
self.resid_pdrop = resid_pdrop self.summary_first_dropout = summary_first_dropout
self.embd_pdrop = embd_pdrop self.summary_proj_to_labels = summary_proj_to_labels
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
......
...@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
Configuration class to store the configuration of a `OpenAIGPTModel`. Configuration class to store the configuration of a `OpenAIGPTModel`.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
n_positions: Number of positional embeddings. n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions). n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states. n_embd: Dimensionality of the embeddings and hidden states.
...@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
def __init__( def __init__(
self, self,
vocab_size_or_config_json_file=40478, vocab_size=40478,
n_positions=512, n_positions=512,
n_ctx=512, n_ctx=512,
n_embd=768, n_embd=768,
...@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
layer_norm_epsilon=1e-5, layer_norm_epsilon=1e-5,
initializer_range=0.02, initializer_range=0.02,
predict_special_tokens=True, predict_special_tokens=True,
num_labels=1,
summary_type='cls_index', summary_type='cls_index',
summary_use_proj=True, summary_use_proj=True,
summary_activation=None, summary_activation=None,
...@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig): ...@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
"""Constructs OpenAIGPTConfig. """Constructs OpenAIGPTConfig.
""" """
super(OpenAIGPTConfig, self).__init__(**kwargs) super(OpenAIGPTConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 self.n_ctx = n_ctx
and isinstance(vocab_size_or_config_json_file, unicode)): self.n_positions = n_positions
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: self.n_embd = n_embd
json_config = json.loads(reader.read()) self.n_layer = n_layer
for key, value in json_config.items(): self.n_head = n_head
self.__dict__[key] = value self.afn = afn
elif isinstance(vocab_size_or_config_json_file, int): self.resid_pdrop = resid_pdrop
self.vocab_size = vocab_size_or_config_json_file self.embd_pdrop = embd_pdrop
self.n_ctx = n_ctx self.attn_pdrop = attn_pdrop
self.n_positions = n_positions self.layer_norm_epsilon = layer_norm_epsilon
self.n_embd = n_embd self.initializer_range = initializer_range
self.n_layer = n_layer self.predict_special_tokens = predict_special_tokens
self.n_head = n_head self.summary_type = summary_type
self.afn = afn self.summary_use_proj = summary_use_proj
self.resid_pdrop = resid_pdrop self.summary_activation = summary_activation
self.embd_pdrop = embd_pdrop self.summary_first_dropout = summary_first_dropout
self.attn_pdrop = attn_pdrop self.summary_proj_to_labels = summary_proj_to_labels
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.predict_special_tokens = predict_special_tokens
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
......
# coding=utf-8
# Copyright 2010, The T5 Authors and HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" T5 model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import sys
import six
from io import open
from .configuration_utils import PretrainedConfig
logger = logging.getLogger(__name__)

# Map from model shortcut name to the URL of its pretrained configuration file.
# All T5 configs live under the same S3 prefix, so build the map from the
# shortcut names rather than repeating the URL five times.
T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    model_name: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-config.json".format(model_name)
    for model_name in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
}
class T5Config(PretrainedConfig):
    r"""
    :class:`~transformers.T5Config` is the configuration class to store the configuration of a
    `T5Model`.

    Arguments:
        vocab_size: Vocabulary size of `inputs_ids` in `T5Model`.
        n_positions: Number of positional embeddings (exposed to callers via the
            `max_position_embeddings` property).
        d_model: Size of the model's hidden states (exposed via the `hidden_size` property).
        d_kv: Size of the key/query/value projections per attention head.
        d_ff: Size of the intermediate feed-forward layer in each block.
        num_layers: Number of hidden layers in the Transformer encoder
            (exposed via the `num_hidden_layers` property).
        num_heads: Number of attention heads for each attention layer in the
            Transformer encoder (exposed via the `num_attention_heads` property).
        relative_attention_num_buckets: Number of buckets used for relative attention.
        dropout_rate: The dropout probability for the model's dropout layers.
        layer_norm_epsilon: The epsilon used by the layer-normalization layers.
        initializer_factor: A factor for initializing all weight matrices (should be kept
            to 1.0, used for initialization testing).
    """
    # Map of shortcut names to pretrained config URLs for `from_pretrained`.
    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size=32128,
                 n_positions=512,
                 d_model=512,
                 d_kv=64,
                 d_ff=2048,
                 num_layers=6,
                 num_heads=8,
                 relative_attention_num_buckets=32,
                 dropout_rate=0.1,
                 layer_norm_epsilon=1e-6,
                 initializer_factor=1.0,
                 **kwargs):
        """Constructs T5Config.

        Extra `**kwargs` (e.g. `output_attentions`, `num_labels`) are handled by
        the `PretrainedConfig` base class.
        """
        super(T5Config, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.dropout_rate = dropout_rate
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor

    # The properties below alias T5's native attribute names to the attribute
    # names shared by the other configuration classes in this library.

    @property
    def max_position_embeddings(self):
        return self.n_positions

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.num_heads

    @property
    def num_hidden_layers(self):
        return self.num_layers
...@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig): ...@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`. """Configuration class to store the configuration of a `TransfoXLModel`.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
cutoffs: cutoffs for the adaptive softmax cutoffs: cutoffs for the adaptive softmax
d_model: Dimensionality of the model's hidden states. d_model: Dimensionality of the model's hidden states.
d_embed: Dimensionality of the embeddings d_embed: Dimensionality of the embeddings
...@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig): ...@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=267735, vocab_size=267735,
cutoffs=[20000, 40000, 200000], cutoffs=[20000, 40000, 200000],
d_model=1024, d_model=1024,
d_embed=1024, d_embed=1024,
...@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig): ...@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
"""Constructs TransfoXLConfig. """Constructs TransfoXLConfig.
""" """
super(TransfoXLConfig, self).__init__(**kwargs) super(TransfoXLConfig, self).__init__(**kwargs)
self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 self.vocab_size = vocab_size
self.cutoffs = [] self.cutoffs = []
self.cutoffs.extend(cutoffs) self.cutoffs.extend(cutoffs)
self.tie_weight = tie_weight self.tie_weight = tie_weight
...@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig): ...@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
self.init_std = init_std self.init_std = init_std
self.layer_norm_epsilon = layer_norm_epsilon self.layer_norm_epsilon = layer_norm_epsilon
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif not isinstance(vocab_size_or_config_json_file, int):
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
return self.tgt_len + self.ext_len + self.mem_len return self.tgt_len + self.ext_len + self.mem_len
@property @property
def vocab_size(self): def n_token(self): # Backward compatibility
return self.n_token return self.vocab_size
@vocab_size.setter @n_token.setter
def vocab_size(self, value): def n_token(self, value): # Backward compatibility
self.n_token = value self.vocab_size = value
@property @property
def hidden_size(self): def hidden_size(self):
......
...@@ -24,7 +24,7 @@ import logging ...@@ -24,7 +24,7 @@ import logging
import os import os
from io import open from io import open
from .file_utils import cached_path, CONFIG_NAME from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -49,8 +49,7 @@ class PretrainedConfig(object): ...@@ -49,8 +49,7 @@ class PretrainedConfig(object):
pretrained_config_archive_map = {} pretrained_config_archive_map = {}
def __init__(self, **kwargs): def __init__(self, **kwargs):
self.finetuning_task = kwargs.pop('finetuning_task', None) # Attributes with defaults
self.num_labels = kwargs.pop('num_labels', 2)
self.output_attentions = kwargs.pop('output_attentions', False) self.output_attentions = kwargs.pop('output_attentions', False)
self.output_hidden_states = kwargs.pop('output_hidden_states', False) self.output_hidden_states = kwargs.pop('output_hidden_states', False)
self.output_past = kwargs.pop('output_past', True) # Not used by all models self.output_past = kwargs.pop('output_past', True) # Not used by all models
...@@ -61,6 +60,22 @@ class PretrainedConfig(object): ...@@ -61,6 +60,22 @@ class PretrainedConfig(object):
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
# Fine-tuning task arguments
self.finetuning_task = kwargs.pop('finetuning_task', None)
self.num_labels = kwargs.pop('num_labels', 2)
self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
# Additional attributes without default values
for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
logger.error("Can't set {} with value {} for {}".format(key, value, self))
raise err
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
""" Save a configuration object to the directory `save_directory`, so that it """ Save a configuration object to the directory `save_directory`, so that it
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
...@@ -81,6 +96,7 @@ class PretrainedConfig(object): ...@@ -81,6 +96,7 @@ class PretrainedConfig(object):
pretrained_model_name_or_path: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
...@@ -133,12 +149,18 @@ class PretrainedConfig(object): ...@@ -133,12 +149,18 @@ class PretrainedConfig(object):
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path): elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
else: elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path config_file = pretrained_model_name_or_path
# redirect to the cache, if necessary else:
config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
try: try:
# Load from URL or cache if already cached
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
proxies=proxies, resume_download=resume_download) proxies=proxies, resume_download=resume_download)
# Load config
config = cls.from_json_file(resolved_config_file)
except EnvironmentError: except EnvironmentError:
if pretrained_model_name_or_path in cls.pretrained_config_archive_map: if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
...@@ -152,15 +174,18 @@ class PretrainedConfig(object): ...@@ -152,15 +174,18 @@ class PretrainedConfig(object):
config_file, CONFIG_NAME) config_file, CONFIG_NAME)
raise EnvironmentError(msg) raise EnvironmentError(msg)
except json.JSONDecodeError:
msg = "Couldn't reach server at '{}' to download configuration file or " \
"configuration file is not a valid JSON file. " \
"Please check network or file content here: {}.".format(config_file, resolved_config_file)
raise EnvironmentError(msg)
if resolved_config_file == config_file: if resolved_config_file == config_file:
logger.info("loading configuration file {}".format(config_file)) logger.info("loading configuration file {}".format(config_file))
else: else:
logger.info("loading configuration file {} from cache at {}".format( logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file)) config_file, resolved_config_file))
# Load config
config = cls.from_json_file(resolved_config_file)
if hasattr(config, 'pruned_heads'): if hasattr(config, 'pruned_heads'):
config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
...@@ -182,17 +207,15 @@ class PretrainedConfig(object): ...@@ -182,17 +207,15 @@ class PretrainedConfig(object):
@classmethod @classmethod
def from_dict(cls, json_object): def from_dict(cls, json_object):
"""Constructs a `Config` from a Python dictionary of parameters.""" """Constructs a `Config` from a Python dictionary of parameters."""
config = cls(vocab_size_or_config_json_file=-1) return cls(**json_object)
for key, value in json_object.items():
setattr(config, key, value)
return config
@classmethod @classmethod
def from_json_file(cls, json_file): def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters.""" """Constructs a `Config` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader: with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read() text = reader.read()
return cls.from_dict(json.loads(text)) dict_obj = json.loads(text)
return cls(**dict_obj)
def __eq__(self, other): def __eq__(self, other):
return self.__dict__ == other.__dict__ return self.__dict__ == other.__dict__
......
...@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig): ...@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`. """Configuration class to store the configuration of a `XLMModel`.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
d_model: Size of the encoder layers and the pooler layer. d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder. n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in n_head: Number of attention heads for each attention layer in
...@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig): ...@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=30145, vocab_size=30145,
emb_dim=2048, emb_dim=2048,
n_layers=12, n_layers=12,
n_heads=16, n_heads=16,
...@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig): ...@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
unk_index=3, unk_index=3,
mask_index=5, mask_index=5,
is_encoder=True, is_encoder=True,
finetuning_task=None,
num_labels=2,
summary_type='first', summary_type='first',
summary_use_proj=True, summary_use_proj=True,
summary_activation=None, summary_activation=None,
...@@ -117,56 +114,46 @@ class XLMConfig(PretrainedConfig): ...@@ -117,56 +114,46 @@ class XLMConfig(PretrainedConfig):
"""Constructs XLMConfig. """Constructs XLMConfig.
""" """
super(XLMConfig, self).__init__(**kwargs) super(XLMConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 self.emb_dim = emb_dim
and isinstance(vocab_size_or_config_json_file, unicode)): self.n_layers = n_layers
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: self.n_heads = n_heads
json_config = json.loads(reader.read()) self.dropout = dropout
for key, value in json_config.items(): self.attention_dropout = attention_dropout
self.__dict__[key] = value self.gelu_activation = gelu_activation
elif isinstance(vocab_size_or_config_json_file, int): self.sinusoidal_embeddings = sinusoidal_embeddings
self.n_words = vocab_size_or_config_json_file self.causal = causal
self.emb_dim = emb_dim self.asm = asm
self.n_layers = n_layers self.n_langs = n_langs
self.n_heads = n_heads self.use_lang_emb = use_lang_emb
self.dropout = dropout self.layer_norm_eps = layer_norm_eps
self.attention_dropout = attention_dropout self.bos_index = bos_index
self.gelu_activation = gelu_activation self.eos_index = eos_index
self.sinusoidal_embeddings = sinusoidal_embeddings self.pad_index = pad_index
self.causal = causal self.unk_index = unk_index
self.asm = asm self.mask_index = mask_index
self.n_langs = n_langs self.is_encoder = is_encoder
self.use_lang_emb = use_lang_emb self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps self.embed_init_std = embed_init_std
self.bos_index = bos_index self.init_std = init_std
self.eos_index = eos_index self.summary_type = summary_type
self.pad_index = pad_index self.summary_use_proj = summary_use_proj
self.unk_index = unk_index self.summary_activation = summary_activation
self.mask_index = mask_index self.summary_proj_to_labels = summary_proj_to_labels
self.is_encoder = is_encoder self.summary_first_dropout = summary_first_dropout
self.max_position_embeddings = max_position_embeddings self.start_n_top = start_n_top
self.embed_init_std = embed_init_std self.end_n_top = end_n_top
self.init_std = init_std
self.finetuning_task = finetuning_task if "n_words" in kwargs:
self.num_labels = num_labels self.n_words = kwargs["n_words"]
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_proj_to_labels = summary_proj_to_labels
self.summary_first_dropout = summary_first_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property @property
def vocab_size(self): def n_words(self): # For backward compatibility
return self.n_words return self.vocab_size
@vocab_size.setter @n_words.setter
def vocab_size(self, value): def n_words(self, value): # For backward compatibility
self.n_words = value self.vocab_size = value
@property @property
def hidden_size(self): def hidden_size(self):
......
...@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig): ...@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``. """Configuration class to store the configuration of a ``XLNetModel``.
Args: Args:
vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
d_model: Size of the encoder layers and the pooler layer. d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder. n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in n_head: Number of attention heads for each attention layer in
...@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig): ...@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self, def __init__(self,
vocab_size_or_config_json_file=32000, vocab_size=32000,
d_model=1024, d_model=1024,
n_layer=24, n_layer=24,
n_head=16, n_head=16,
d_inner=4096, d_inner=4096,
max_position_embeddings=512,
ff_activation="gelu", ff_activation="gelu",
untie_r=True, untie_r=True,
attn_type="bi", attn_type="bi",
initializer_range=0.02, initializer_range=0.02,
layer_norm_eps=1e-12, layer_norm_eps=1e-12,
dropout=0.1, dropout=0.1,
mem_len=None, mem_len=None,
reuse_len=None, reuse_len=None,
bi_data=False, bi_data=False,
clamp_len=-1, clamp_len=-1,
same_length=False, same_length=False,
finetuning_task=None,
num_labels=2,
summary_type='last', summary_type='last',
summary_use_proj=True, summary_use_proj=True,
summary_activation='tanh', summary_activation='tanh',
...@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig): ...@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
"""Constructs XLNetConfig. """Constructs XLNetConfig.
""" """
super(XLNetConfig, self).__init__(**kwargs) super(XLNetConfig, self).__init__(**kwargs)
self.vocab_size = vocab_size
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 self.d_model = d_model
and isinstance(vocab_size_or_config_json_file, unicode)): self.n_layer = n_layer
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: self.n_head = n_head
json_config = json.loads(reader.read()) assert d_model % n_head == 0
for key, value in json_config.items(): self.d_head = d_model // n_head
setattr(config, key, value) self.ff_activation = ff_activation
elif isinstance(vocab_size_or_config_json_file, int): self.d_inner = d_inner
self.n_token = vocab_size_or_config_json_file self.untie_r = untie_r
self.d_model = d_model self.attn_type = attn_type
self.n_layer = n_layer
self.n_head = n_head self.initializer_range = initializer_range
assert d_model % n_head == 0 self.layer_norm_eps = layer_norm_eps
self.d_head = d_model // n_head
self.ff_activation = ff_activation self.dropout = dropout
self.d_inner = d_inner self.mem_len = mem_len
self.untie_r = untie_r self.reuse_len = reuse_len
self.attn_type = attn_type self.bi_data = bi_data
self.clamp_len = clamp_len
self.initializer_range = initializer_range self.same_length = same_length
self.layer_norm_eps = layer_norm_eps
self.summary_type = summary_type
self.dropout = dropout self.summary_use_proj = summary_use_proj
self.mem_len = mem_len self.summary_activation = summary_activation
self.reuse_len = reuse_len self.summary_last_dropout = summary_last_dropout
self.bi_data = bi_data self.start_n_top = start_n_top
self.clamp_len = clamp_len self.end_n_top = end_n_top
self.same_length = same_length
self.finetuning_task = finetuning_task
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property @property
def max_position_embeddings(self): def max_position_embeddings(self):
return -1 return -1
@property @property
def vocab_size(self): def n_token(self): # Backward compatibility
return self.n_token return self.vocab_size
@vocab_size.setter @n_token.setter
def vocab_size(self, value): def n_token(self, value): # Backward compatibility
self.n_token = value self.vocab_size = value
@property @property
def hidden_size(self): def hidden_size(self):
......
...@@ -34,7 +34,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model, ...@@ -34,7 +34,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP) AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
if is_torch_available(): if is_torch_available():
import torch import torch
...@@ -48,7 +49,8 @@ if is_torch_available(): ...@@ -48,7 +49,8 @@ if is_torch_available():
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
else: else:
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
...@@ -59,7 +61,8 @@ else: ...@@ -59,7 +61,8 @@ else:
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = ( AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
None, None, None, None, None, None, None, None,
None, None, None, None,
None, None, None, None,
...@@ -69,6 +72,7 @@ else: ...@@ -69,6 +72,7 @@ else:
None, None, None, None, None, None,
None, None, None, None, None, None,
None, None, None, None,
None, None,
None, None) None, None)
...@@ -90,7 +94,8 @@ MODEL_CLASSES = { ...@@ -90,7 +94,8 @@ MODEL_CLASSES = {
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP) 'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
} }
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
...@@ -115,23 +120,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file ...@@ -115,23 +120,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
if compare_with_pt_model: if compare_with_pt_model:
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False) # build the network
pt_model = pt_model_class.from_pretrained(None, state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
config=config, config=config,
state_dict=torch.load(pytorch_checkpoint_path, state_dict=state_dict)
map_location='cpu'))
pt_inputs = torch.tensor(inputs_list)
with torch.no_grad(): with torch.no_grad():
pto = pt_model(pt_inputs) pto = pt_model(**pt_model.dummy_inputs)
np_pt = pto[0].detach().numpy() np_pt = pto[0].numpy()
np_tf = tfo[0].numpy() np_tf = tfo[0].numpy()
diff = np.amax(np.abs(np_pt - np_tf)) diff = np.amax(np.abs(np_pt - np_tf))
print("Max absolute difference between models outputs {}".format(diff)) print("Max absolute difference between models outputs {}".format(diff))
assert diff <= 2e-2, "Error, model absolute difference is >2e-2" assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
# Save pytorch-model # Save pytorch-model
print("Save TensorFlow model to {}".format(tf_dump_path)) print("Save TensorFlow model to {}".format(tf_dump_path))
...@@ -139,7 +142,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file ...@@ -139,7 +142,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False): compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
if args_model_type is None: if args_model_type is None:
...@@ -187,13 +190,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc ...@@ -187,13 +190,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
if os.path.isfile(model_shortcut_name): if os.path.isfile(model_shortcut_name):
model_shortcut_name = 'converted_model' model_shortcut_name = 'converted_model'
convert_pt_checkpoint_to_tf(model_type=model_type, convert_pt_checkpoint_to_tf(model_type=model_type,
pytorch_checkpoint_path=model_file, pytorch_checkpoint_path=model_file,
config_file=config_file, config_file=config_file,
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
compare_with_pt_model=compare_with_pt_model) compare_with_pt_model=compare_with_pt_model)
os.remove(config_file) if remove_cached_files:
os.remove(model_file) os.remove(config_file)
os.remove(model_file)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -226,6 +231,9 @@ if __name__ == "__main__": ...@@ -226,6 +231,9 @@ if __name__ == "__main__":
parser.add_argument("--use_cached_models", parser.add_argument("--use_cached_models",
action='store_true', action='store_true',
help = "Use cached models if possible instead of updating to latest checkpoint versions.") help = "Use cached models if possible instead of updating to latest checkpoint versions.")
parser.add_argument("--remove_cached_files",
action='store_true',
help = "Remove pytorch models after conversion (save memory when converting in batches).")
parser.add_argument("--only_convert_finetuned_models", parser.add_argument("--only_convert_finetuned_models",
action='store_true', action='store_true',
help = "Only convert finetuned models.") help = "Only convert finetuned models.")
...@@ -245,4 +253,5 @@ if __name__ == "__main__": ...@@ -245,4 +253,5 @@ if __name__ == "__main__":
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
compare_with_pt_model=args.compare_with_pt_model, compare_with_pt_model=args.compare_with_pt_model,
use_cached_models=args.use_cached_models, use_cached_models=args.use_cached_models,
remove_cached_files=args.remove_cached_files,
only_convert_finetuned_models=args.only_convert_finetuned_models) only_convert_finetuned_models=args.only_convert_finetuned_models)
...@@ -20,6 +20,13 @@ import argparse ...@@ -20,6 +20,13 @@ import argparse
import logging import logging
import numpy as np import numpy as np
import torch import torch
import pathlib
import fairseq
from packaging import version
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
raise Exception("requires fairseq >= 0.9.0")
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer from fairseq.modules import TransformerSentenceEncoderLayer
...@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
""" """
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
roberta.eval() # disable dropout roberta.eval() # disable dropout
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
config = BertConfig( config = BertConfig(
vocab_size_or_config_json_file=50265, vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
hidden_size=roberta.args.encoder_embed_dim, hidden_size=roberta.args.encoder_embed_dim,
num_hidden_layers=roberta.args.encoder_layers, num_hidden_layers=roberta.args.encoder_layers,
num_attention_heads=roberta.args.encoder_attention_heads, num_attention_heads=roberta.args.encoder_attention_heads,
...@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
# Now let's copy all the weights. # Now let's copy all the weights.
# Embeddings # Embeddings
roberta_sent_encoder = roberta.model.decoder.sentence_encoder
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them.
...@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
### self attention ### self attention
self_attn: BertSelfAttention = layer.attention.self self_attn: BertSelfAttention = layer.attention.self
assert( assert(
roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) roberta_layer.self_attn.k_proj.weight.data.shape == \
roberta_layer.self_attn.q_proj.weight.data.shape == \
roberta_layer.self_attn.v_proj.weight.data.shape == \
torch.Size((config.hidden_size, config.hidden_size))
) )
# we use three distinct linear layers so we split the source layer here.
self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
### self-attention output ### self-attention output
self_output: BertSelfOutput = layer.attention.output self_output: BertSelfOutput = layer.attention.output
...@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ...@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
if not success: if not success:
raise Exception("Something went wRoNg") raise Exception("Something went wRoNg")
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}") print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path) model.save_pretrained(pytorch_dump_folder_path)
......
# coding=utf-8
# Copyright 2018 The T5 authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert T5 checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import torch
from transformers import T5Config, T5Model, load_tf_weights_in_t5
import logging
logging.basicConfig(level=logging.INFO)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    """Convert a TensorFlow T5 checkpoint into a PyTorch state dict.

    Args:
        tf_checkpoint_path: Path to the TensorFlow checkpoint to convert.
        config_file: JSON file describing the T5 model architecture.
        pytorch_dump_path: Destination file for the converted PyTorch weights.
    """
    # Build an (uninitialised) PyTorch model from the JSON configuration.
    t5_config = T5Config.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(t5_config)))
    pt_model = T5Model(t5_config)

    # Copy every variable from the TF checkpoint into the PyTorch module tree.
    load_tf_weights_in_t5(pt_model, t5_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the pickled module itself.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(pt_model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
    # Command-line entry point: all three path arguments are mandatory.
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path", default=None, type=str, required=True,
                        help="Path to the TensorFlow checkpoint path.")
    parser.add_argument("--config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained T5 model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path", default=None, type=str, required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()

    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
...@@ -695,7 +695,12 @@ def compute_predictions_log_probs( ...@@ -695,7 +695,12 @@ def compute_predictions_log_probs(
tok_text = " ".join(tok_text.split()) tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens) orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, if hasattr(tokenizer, "do_lower_case"):
do_lower_case = tokenizer.do_lower_case
else:
do_lower_case = tokenizer.do_lowercase_and_remove_accent
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose_logging) verbose_logging)
if final_text in seen_predictions: if final_text in seen_predictions:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment