"docs/vscode:/vscode.git/clone" did not exist on "336116d960e19319767808449ec68ad4f8ee15fa"
Commit 982f181a authored by erenup

Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc

parents 603b470a 84b9d1c4
@@ -21,7 +21,7 @@ from io import open
 import torch

-from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
-                                                  OpenAIGPTConfig,
-                                                  OpenAIGPTModel,
-                                                  load_tf_weights_in_openai_gpt)
+from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+                                  OpenAIGPTConfig,
+                                  OpenAIGPTModel,
+                                  load_tf_weights_in_openai_gpt)
...
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers.modeling import BertModel
+from pytorch_transformers import BertModel

 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
...
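The hunk above only touches the import, but the signature shows how the converter is driven. A minimal driver sketch under the post-change imports (the checkpoint directory and model name below are illustrative values, not taken from the script):

    from pytorch_transformers import BertModel

    # Load pretrained PyTorch weights, then write them out as a TF checkpoint.
    model = BertModel.from_pretrained('bert-base-uncased')
    convert_pytorch_checkpoint_to_tf(model, ckpt_dir='./tf_ckpt', model_name='bert-base-uncased')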
@@ -23,12 +23,12 @@ import torch
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder,
-                                                BertIntermediate, BertLayer,
-                                                BertModel, BertOutput,
-                                                BertSelfAttention,
-                                                BertSelfOutput)
-from pytorch_transformers.modeling_roberta import (RobertaEmbeddings,
-                                                   RobertaForMaskedLM,
-                                                   RobertaForSequenceClassification,
-                                                   RobertaModel)
+from pytorch_transformers import (BertConfig, BertEncoder,
+                                  BertIntermediate, BertLayer,
+                                  BertModel, BertOutput,
+                                  BertSelfAttention,
+                                  BertSelfOutput)
+from pytorch_transformers import (RobertaEmbeddings,
+                                  RobertaForMaskedLM,
+                                  RobertaForSequenceClassification,
+                                  RobertaModel)
@@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
         intermediate_size=roberta.args.encoder_ffn_embed_dim,
         max_position_embeddings=514,
         type_vocab_size=1,
+        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
     )
     if classification_head:
         config.num_labels = roberta.args.num_classes
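The new `layer_norm_eps=1e-5` matters because fairseq's layer norms use PyTorch's `nn.LayerNorm` default of 1e-5, while `BertConfig` defaults to 1e-12 (visible in the removed `BertConfig` at the end of this diff); without the override, the converted model's normalization would drift slightly from the original. A quick way to confirm the default:

    import torch

    print(torch.nn.LayerNorm(10).eps)  # 1e-05, the value fairseq inherits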
@@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
     model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
     model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
-    model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps

     for i in range(config.num_hidden_layers):
         # Encoder: start of layer
@@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
         self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
         self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
-        self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps

         ### intermediate
         intermediate: BertIntermediate = layer.intermediate
@@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
         bert_output.dense.bias = roberta_layer.fc2.bias
         bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
         bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
-        bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps

         #### end of layer
     if classification_head:
@@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
         model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
         model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
         model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-        model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps

     model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
     model.lm_head.bias = roberta.model.decoder.lm_head.bias
@@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
     else:
         their_output = roberta.model(input_ids)[0]
     print(our_output.shape, their_output.shape)
+    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
     print(
         "Do both models output the same tensors?",
...
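The check added above is the standard conversion sanity check: run the same input through the converted and the original model and compare outputs elementwise. A self-contained sketch of the pattern (the tensors here are placeholders standing in for the two models' outputs, not the script's actual objects):

    import torch

    def outputs_match(our_output: torch.Tensor, their_output: torch.Tensor, atol: float = 1e-3) -> bool:
        # Report the worst-case elementwise deviation, then apply the tolerance check.
        max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
        print(f"max_absolute_diff = {max_absolute_diff}")  # typically ~1e-7 for a faithful conversion
        return torch.allclose(our_output, their_output, atol=atol)

    # Usage: identical tensors trivially match.
    t = torch.randn(1, 11, 768)
    assert outputs_match(t, t.clone())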
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch

-from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

 import logging
 logging.basicConfig(level=logging.INFO)
...
@@ -26,7 +26,7 @@ import torch
 import pytorch_transformers.tokenization_transfo_xl as data_utils

 from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
-                                                      load_tf_weights_in_transfo_xl)
+from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
+                                  load_tf_weights_in_transfo_xl)
 from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
...
@@ -23,7 +23,7 @@ from io import open
 import torch
 import numpy

-from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
 from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES

 import logging
...
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch

-from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
-                                                 XLNetConfig,
-                                                 XLNetLMHeadModel, XLNetForQuestionAnswering,
-                                                 XLNetForSequenceClassification,
+from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+                                  XLNetConfig,
+                                  XLNetLMHeadModel, XLNetForQuestionAnswering,
+                                  XLNetForSequenceClassification,
...
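Every conversion-script hunk above makes the same mechanical change: import from the package's public top level instead of a `modeling_*` submodule, so the internal module layout can change without breaking the scripts. As a sketch (both forms work at this point in the library's history; the second is the one the commit standardizes on):

    # Old style: reach into a private submodule.
    # from pytorch_transformers.modeling_xlnet import XLNetConfig, XLNetLMHeadModel

    # New style: rely only on the package's public namespace.
    from pytorch_transformers import XLNetConfig, XLNetLMHeadModel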
@@ -9,6 +9,7 @@ import sys
 import json
 import logging
 import os
+import six
 import shutil
 import tempfile
 import fnmatch
@@ -47,8 +48,35 @@ except (AttributeError, ImportError):

 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility

+WEIGHTS_NAME = "pytorch_model.bin"
+TF_WEIGHTS_NAME = 'model.ckpt'
+CONFIG_NAME = "config.json"
+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

+if not six.PY2:
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = ''.join(docstr) + fn.__doc__
+            return fn
+        return docstring_decorator
+
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = fn.__doc__ + ''.join(docstr)
+            return fn
+        return docstring_decorator
+else:
+    # Not possible to update class docstrings on python2
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+
 def url_to_filename(url, etag=None):
     """
...
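These decorators assemble long shared docstrings at import time: `add_start_docstrings` prepends its arguments to a function's existing `__doc__` (which is why the Python 2 branch is a no-op, since class docstrings cannot be rewritten there). A minimal usage sketch, assuming the post-change module layout; the docstring text is illustrative, not from the library:

    from pytorch_transformers.file_utils import add_start_docstrings

    @add_start_docstrings("Shared preamble describing the common inputs.\n\n")
    def forward(x):
        """Model-specific details."""
        return x

    print(forward.__doc__)  # the shared preamble followed by the function's own docstring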
@@ -18,233 +18,480 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import logging

-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-from torch.nn.parameter import Parameter
-
-from .modeling_bert import BertConfig, BertModel
-from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
-from .modeling_gpt2 import GPT2Config, GPT2Model
-from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
-from .modeling_xlnet import XLNetConfig, XLNetModel
-from .modeling_xlm import XLMConfig, XLMModel
-from .modeling_roberta import RobertaConfig, RobertaModel
-from .modeling_distilbert import DistilBertConfig, DistilBertModel
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification

 from .modeling_utils import PreTrainedModel, SequenceSummary
+from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)
-class AutoConfig(object):
-    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
-        that will be instantiated as one of the configuration classes of the library
-        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method take care of returning the correct model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
-
-        This class cannot be instantiated using `__init__()` (throw an error).
-    """
-    def __init__(self):
-        raise EnvironmentError("AutoConfig is designed to be instantiated "
-            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a one of the configuration classes of the library
-        from a pre-trained model configuration.
-
-        The configuration class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
-
-        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                  or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                  using the `save_pretrained(save_directory)` method.
-                - a path or url to a saved configuration `file`.
-            **cache_dir**: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            **return_unused_kwargs**: (`optional`) bool:
-                - If False, then this function returns just the final configuration object.
-                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-                  is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-                  ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key/value pairs with which to update the configuration object after loading.
-                - The values in kwargs of any keys which are configuration attributes will be used
-                  to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                  by the `return_unused_kwargs` keyword parameter.
-
-        Examples::
-
-            config = AutoConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
-            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
-            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-            assert config.output_attention == True
-            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
-                                                               foo=False, return_unused_kwargs=True)
-            assert config.output_attention == True
-            assert unused_kwargs == {'foo': False}
-
-        """
-        if 'distilbert' in pretrained_model_name_or_path:
-            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
-            return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
-            return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
-            return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
-            return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
-            return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
-            return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class AutoModel(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        that will be instantiated as one of the base model classes of the library
+        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModel is designed to be instantiated "
+            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the base model classes of the library
+        from a pre-trained model configuration.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
-class AutoModel(object):
-    r"""
-        :class:`~pytorch_transformers.AutoModel` is a generic model class
-        that will be instantiated as one of the base model classes of the library
-        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
-
-        The `from_pretrained()` method take care of returning the correct model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertModel (Bert model)
-            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: XLNetModel (XLNet model)
-            - contains `xlm`: XLMModel (XLM model)
-            - contains `roberta`: RobertaModel (RoBERTa model)
-
-        This class cannot be instantiated using `__init__()` (throw an error).
-    """
-    def __init__(self):
-        raise EnvironmentError("AutoModel is designed to be instantiated "
-            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiate a one of the base model classes of the library
-        from a pre-trained model configuration.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertModel (Bert model)
-            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: XLNetModel (XLNet model)
-            - contains `xlm`: XLMModel (XLM model)
-            - contains `roberta`: RobertaModel (RoBERTa model)
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-        To train the model, you should first set it back in training mode with `model.train()`
-
-        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model to load from cache
-                  or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                  using the `save_pretrained(save_directory)` method.
-                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
-                  In this case, ``from_tf`` should be set to True and a configuration object should be
-                  provided as `config` argument. This loading option is slower than converting the TensorFlow
-                  checkpoint in a PyTorch model using the provided conversion scripts and loading
-                  the PyTorch model afterwards.
-            **model_args**: (`optional`) Sequence:
-                All remaning positional arguments will be passed to the underlying model's __init__ function
-            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
-                Configuration can be automatically loaded when:
-                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
-                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
-            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
-                from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
-                a simpler option.
-            **cache_dir**: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            **output_loading_info**: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key, values to update the configuration object after loading.
-                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
-                - If a configuration is provided with `config`, **kwargs will be directly passed
-                  to the underlying model's __init__ method.
-                - If a configuration is not provided, **kwargs will be first passed to the pretrained
-                  model configuration class loading function (`PretrainedConfig.from_pretrained`).
-                  Each key of **kwargs that corresponds to a configuration attribute
-                  will be used to override said attribute with the supplied **kwargs value.
-                  Remaining keys that do not correspond to any configuration attribute will
-                  be passed to the underlying model's __init__ function.
-
-        Examples::
-
-            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        if 'distilbert' in pretrained_model_name_or_path:
-            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'bert' in pretrained_model_name_or_path:
-            return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'openai-gpt' in pretrained_model_name_or_path:
-            return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'gpt2' in pretrained_model_name_or_path:
-            return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'transfo-xl' in pretrained_model_name_or_path:
-            return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlnet' in pretrained_model_name_or_path:
-            return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'xlm' in pretrained_model_name_or_path:
-            return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class AutoModelWithLMHead(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        that will be instantiated as one of the language modeling model classes of the library
+        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the language modeling model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

+        Examples::
+
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class AutoModelForSequenceClassification(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        that will be instantiated as one of the sequence classification model classes of the library
+        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: BertForSequenceClassification (Bert model)
+            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: XLMForSequenceClassification (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
+            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the sequence classification model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: BertForSequenceClassification (Bert model)
+            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: XLMForSequenceClassification (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+class AutoModelForQuestionAnswering(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        that will be instantiated as one of the question answering model classes of the library
+        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
...
@@ -28,8 +28,9 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss

-from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel,
-                             prune_linear_layer, add_start_docstrings)
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_bert import BertConfig
+from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)
@@ -49,23 +50,6 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }

-BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
-    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
-}

 def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model.
     """
...@@ -149,95 +133,11 @@ def swish(x): ...@@ -149,95 +133,11 @@ def swish(x):
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class BertConfig(PretrainedConfig):
r"""
:class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs):
super(BertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
try: try:
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except (ImportError, AttributeError) as e: except (ImportError, AttributeError) as e:
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
class BertLayerNorm(nn.Module): BertLayerNorm = torch.nn.LayerNorm
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
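The hand-rolled class removed here normalizes over the last dimension with the epsilon inside the square root, which is exactly what ``torch.nn.LayerNorm`` computes; that equivalence is what makes the simple alias above safe. A small check of the claim (shapes and values are arbitrary):

    import torch
    import torch.nn as nn

    hidden_size, eps = 8, 1e-12
    x = torch.randn(2, 4, hidden_size)

    ln = nn.LayerNorm(hidden_size, eps=eps)     # weight=1, bias=0 at init

    u = x.mean(-1, keepdim=True)                # same math as the removed class
    s = (x - u).pow(2).mean(-1, keepdim=True)   # biased variance
    manual = (x - u) / torch.sqrt(s + eps)

    assert torch.allclose(ln(x), manual, atol=1e-6)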
class BertEmbeddings(nn.Module): class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings.
...@@ -350,23 +250,30 @@ class BertAttention(nn.Module): ...@@ -350,23 +250,30 @@ class BertAttention(nn.Module):
super(BertAttention, self).__init__() super(BertAttention, self).__init__()
self.self = BertSelfAttention(config) self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config) self.output = BertSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads): def prune_heads(self, heads):
if len(heads) == 0: if len(heads) == 0:
return return
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
for head in heads: for head in heads:
# Compute how many pruned heads are before the head and move the index accordingly
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0 mask[head] = 0
mask = mask.view(-1).contiguous().eq(1) mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long() index = torch.arange(len(mask))[mask].long()
# Prune linear layers # Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index) self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index) self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index) self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads) self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
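The new ``pruned_heads`` bookkeeping lets ``prune_heads`` be called more than once: after a first pruning the surviving heads are re-indexed, so a later request for head ``h`` has to be shifted down by the number of already-pruned heads below it. The index arithmetic in isolation (numbers chosen for illustration):

    # A layer started with 12 heads; heads {0, 3} were pruned earlier.
    # Pruning head 5 of the original numbering must now target row 3
    # of the remaining 10-head mask.
    pruned_heads = {0, 3}
    head = 5
    head = head - sum(1 if h < head else 0 for h in pruned_heads)
    assert head == 3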
def forward(self, input_tensor, attention_mask, head_mask=None): def forward(self, input_tensor, attention_mask, head_mask=None):
self_outputs = self.self(input_tensor, attention_mask, head_mask) self_outputs = self.self(input_tensor, attention_mask, head_mask)
...@@ -544,12 +451,8 @@ class BertPreTrainedModel(PreTrainedModel): ...@@ -544,12 +451,8 @@ class BertPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_bert load_tf_weights = load_tf_weights_in_bert
base_model_prefix = "bert" base_model_prefix = "bert"
def __init__(self, *inputs, **kwargs): def _init_weights(self, module):
super(BertPreTrainedModel, self).__init__(*inputs, **kwargs) """ Initialize the weights """
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)): if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization # Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617 # cf https://github.com/pytorch/pytorch/pull/5617
...@@ -606,18 +509,18 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -606,18 +509,18 @@ BERT_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings. Mask to avoid performing attention on padding token indices.
Selected in the range ``[0, config.max_position_embeddings - 1]``. Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices. Indices of positions of each input sequence tokens in the position embeddings.
Mask values selected in ``[0, 1]``: Selected in the range ``[0, config.max_position_embeddings - 1]``.
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
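For reference, the ``attention_mask`` described above is usually derived directly from the padded batch: 1 where there is a real token, 0 where there is padding. A minimal sketch (pad token id 0 is the BERT convention; the ids themselves are illustrative):

    import torch

    # Two sequences padded to length 6 with pad token id 0.
    input_ids = torch.tensor([[101, 7592, 2088,  102,    0,    0],
                              [101, 2023, 2003, 2307,  999,  102]])
    attention_mask = (input_ids != 0).long()  # 1 = attend, 0 = padding
    # tensor([[1, 1, 1, 1, 0, 0],
    #         [1, 1, 1, 1, 1, 1]])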
...@@ -662,7 +565,7 @@ class BertModel(BertPreTrainedModel): ...@@ -662,7 +565,7 @@ class BertModel(BertPreTrainedModel):
self.encoder = BertEncoder(config) self.encoder = BertEncoder(config)
self.pooler = BertPooler(config) self.pooler = BertPooler(config)
self.apply(self.init_weights) self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.embeddings.word_embeddings old_embeddings = self.embeddings.word_embeddings
...@@ -678,7 +581,7 @@ class BertModel(BertPreTrainedModel): ...@@ -678,7 +581,7 @@ class BertModel(BertPreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None): def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
if attention_mask is None: if attention_mask is None:
attention_mask = torch.ones_like(input_ids) attention_mask = torch.ones_like(input_ids)
if token_type_ids is None: if token_type_ids is None:
...@@ -771,7 +674,7 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -771,7 +674,7 @@ class BertForPreTraining(BertPreTrainedModel):
self.bert = BertModel(config) self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config) self.cls = BertPreTrainingHeads(config)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -781,10 +684,14 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -781,10 +684,14 @@ class BertForPreTraining(BertPreTrainedModel):
self._tie_or_clone_weights(self.cls.predictions.decoder, self._tie_or_clone_weights(self.cls.predictions.decoder,
self.bert.embeddings.word_embeddings) self.bert.embeddings.word_embeddings)
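``tie_weights`` makes the MLM decoder share its weight matrix with the input word embeddings (cloning instead of sharing when the configuration requires it, e.g. for TorchScript). A quick sketch of what the sharing amounts to, assuming the imports shown and a default configuration:

    from pytorch_transformers import BertConfig, BertForPreTraining

    config = BertConfig(vocab_size_or_config_json_file=30522)
    model = BertForPreTraining(config)  # __init__ calls tie_weights()

    # The decoder projection and the input embedding are one Parameter,
    # so a gradient step updates both at once.
    assert (model.cls.predictions.decoder.weight.data_ptr()
            == model.bert.embeddings.word_embeddings.weight.data_ptr())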
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
next_sentence_label=None, position_ids=None, head_mask=None): masked_lm_labels=None, next_sentence_label=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask) outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
...@@ -839,7 +746,7 @@ class BertForMaskedLM(BertPreTrainedModel): ...@@ -839,7 +746,7 @@ class BertForMaskedLM(BertPreTrainedModel):
self.bert = BertModel(config) self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config) self.cls = BertOnlyMLMHead(config)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -849,10 +756,14 @@ class BertForMaskedLM(BertPreTrainedModel): ...@@ -849,10 +756,14 @@ class BertForMaskedLM(BertPreTrainedModel):
self._tie_or_clone_weights(self.cls.predictions.decoder, self._tie_or_clone_weights(self.cls.predictions.decoder,
self.bert.embeddings.word_embeddings) self.bert.embeddings.word_embeddings)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
position_ids=None, head_mask=None): masked_lm_labels=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask) outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output) prediction_scores = self.cls(sequence_output)
...@@ -904,12 +815,17 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -904,12 +815,17 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
self.bert = BertModel(config) self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config) self.cls = BertOnlyNSPHead(config)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
next_sentence_label=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
position_ids=None, head_mask=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask)
pooled_output = outputs[1] pooled_output = outputs[1]
seq_relationship_score = self.cls(pooled_output) seq_relationship_score = self.cls(pooled_output)
...@@ -965,12 +881,17 @@ class BertForSequenceClassification(BertPreTrainedModel): ...@@ -965,12 +881,17 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask)
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) pooled_output = self.dropout(pooled_output)
...@@ -993,45 +914,9 @@ class BertForSequenceClassification(BertPreTrainedModel): ...@@ -993,45 +914,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING) BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertForMultipleChoice(BertPreTrainedModel): class BertForMultipleChoice(BertPreTrainedModel):
r""" r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
(b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0``
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
...@@ -1069,18 +954,23 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1069,18 +954,23 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1) self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None): position_ids=None, head_mask=None, labels=None):
num_choices = input_ids.shape[1] num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) input_ids = input_ids.view(-1, input_ids.size(-1))
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask, head_mask=head_mask) outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) pooled_output = self.dropout(pooled_output)
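The reshaping above is the crux of the multiple-choice head: inputs arrive as ``(batch_size, num_choices, seq_len)``, are flattened to ``(batch_size * num_choices, seq_len)`` for the encoder, and the per-choice logits are folded back before the loss. A shape-only sketch (the pooled output is a stand-in tensor, not a real encoder call):

    import torch

    batch_size, num_choices, seq_len, hidden = 2, 4, 16, 768
    input_ids = torch.randint(0, 30522, (batch_size, num_choices, seq_len))

    flat = input_ids.view(-1, input_ids.size(-1))     # (8, 16) fed to BERT
    pooled = torch.randn(flat.size(0), hidden)        # stand-in for outputs[1]
    logits = torch.nn.Linear(hidden, 1)(pooled)       # (8, 1)
    reshaped_logits = logits.view(-1, num_choices)    # (2, 4): one score per choice
    assert reshaped_logits.shape == (batch_size, num_choices)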
...@@ -1137,12 +1027,17 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1137,12 +1027,17 @@ class BertForTokenClassification(BertPreTrainedModel):
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask)
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output) sequence_output = self.dropout(sequence_output)
...@@ -1211,12 +1106,17 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1211,12 +1106,17 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.bert = BertModel(config) self.bert = BertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
start_positions=None, end_positions=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
end_positions=None, position_ids=None, head_mask=None):
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
attention_mask=attention_mask, head_mask=head_mask)
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output) logits = self.qa_outputs(sequence_output)
......
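Downstream, the two logits per token from ``qa_outputs`` are split into start and end scores and decoded into an answer span. A hedged sketch of the simplest (argmax) decoding, using a stand-in tensor in place of the model output:

    import torch

    seq_len = 10
    logits = torch.randn(1, seq_len, 2)                 # stand-in for qa_outputs(...)
    start_logits, end_logits = logits.split(1, dim=-1)  # same split as in the forward
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    start = int(start_logits.argmax(dim=-1))            # most likely span start
    end = int(end_logits.argmax(dim=-1))                # most likely span end
    # (start, end) is then mapped back from token indices to text.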
...@@ -31,7 +31,9 @@ import numpy as np ...@@ -31,7 +31,9 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -42,69 +44,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { ...@@ -42,69 +44,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
} }
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
}
class DistilBertConfig(PretrainedConfig):
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30522,
max_position_embeddings=512,
sinusoidal_pos_embds=True,
n_layers=6,
n_heads=12,
dim=768,
hidden_dim=4*768,
dropout=0.1,
attention_dropout=0.1,
activation='gelu',
initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1,
seq_classif_dropout=0.2,
**kwargs):
super(DistilBertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def hidden_size(self):
return self.dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
def gelu(x): def gelu(x):
...@@ -174,12 +113,16 @@ class MultiHeadSelfAttention(nn.Module): ...@@ -174,12 +113,16 @@ class MultiHeadSelfAttention(nn.Module):
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.pruned_heads = set()
def prune_heads(self, heads): def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads attention_head_size = self.dim // self.n_heads
if len(heads) == 0: if len(heads) == 0:
return return
mask = torch.ones(self.n_heads, attention_head_size) mask = torch.ones(self.n_heads, attention_head_size)
heads = set(heads) - self.pruned_heads
for head in heads: for head in heads:
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0 mask[head] = 0
mask = mask.view(-1).contiguous().eq(1) mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long() index = torch.arange(len(mask))[mask].long()
...@@ -191,6 +134,7 @@ class MultiHeadSelfAttention(nn.Module): ...@@ -191,6 +134,7 @@ class MultiHeadSelfAttention(nn.Module):
# Update hyper params # Update hyper params
self.n_heads = self.n_heads - len(heads) self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, query, key, value, mask, head_mask = None): def forward(self, query, key, value, mask, head_mask = None):
""" """
...@@ -395,7 +339,7 @@ class DistilBertPreTrainedModel(PreTrainedModel): ...@@ -395,7 +339,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
def __init__(self, *inputs, **kwargs): def __init__(self, *inputs, **kwargs):
super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs) super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)
def init_weights(self, module): def _init_weights(self, module):
""" Initialize the weights. """ Initialize the weights.
""" """
if isinstance(module, nn.Embedding): if isinstance(module, nn.Embedding):
...@@ -480,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -480,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.embeddings = Embeddings(config) # Embeddings self.embeddings = Embeddings(config) # Embeddings
self.transformer = Transformer(config) # Encoder self.transformer = Transformer(config) # Encoder
self.apply(self.init_weights) self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.embeddings.word_embeddings old_embeddings = self.embeddings.word_embeddings
...@@ -568,7 +512,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -568,7 +512,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
...@@ -580,10 +524,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -580,10 +524,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
self._tie_or_clone_weights(self.vocab_projector, self._tie_or_clone_weights(self.vocab_projector,
self.distilbert.embeddings.word_embeddings) self.distilbert.embeddings.word_embeddings)
def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None): def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
dlbrt_output = self.distilbert(input_ids=input_ids, dlbrt_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
head_mask=head_mask) head_mask=head_mask)
hidden_states = dlbrt_output[0] # (bs, seq_length, dim) hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
...@@ -642,12 +586,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -642,12 +586,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.classifier = nn.Linear(config.dim, config.num_labels) self.classifier = nn.Linear(config.dim, config.num_labels)
self.dropout = nn.Dropout(config.seq_classif_dropout) self.dropout = nn.Dropout(config.seq_classif_dropout)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None): def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
distilbert_output = self.distilbert(input_ids=input_ids, distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
head_mask=head_mask) head_mask=head_mask)
hidden_state = distilbert_output[0] # (bs, seq_len, dim) hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
...@@ -716,12 +660,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -716,12 +660,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
assert config.num_labels == 2 assert config.num_labels == 2
self.dropout = nn.Dropout(config.qa_dropout) self.dropout = nn.Dropout(config.qa_dropout)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None): def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
distilbert_output = self.distilbert(input_ids=input_ids, distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
head_mask=head_mask) head_mask=head_mask)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
......
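When ``start_positions`` and ``end_positions`` are supplied, both QA heads in this diff compute the loss the same way: one cross-entropy per endpoint, then the average. The arithmetic on its own (random logits, hand-picked targets):

    import torch
    from torch.nn import CrossEntropyLoss

    batch, seq_len = 2, 10
    start_logits = torch.randn(batch, seq_len)
    end_logits = torch.randn(batch, seq_len)
    start_positions = torch.tensor([1, 3])
    end_positions = torch.tensor([2, 5])

    loss_fct = CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = (start_loss + end_loss) / 2  # averaged span loss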
...@@ -30,19 +30,15 @@ import torch.nn as nn ...@@ -30,19 +30,15 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
PreTrainedModel, prune_conv1d_layer, SequenceSummary, from .configuration_gpt2 import GPT2Config
add_start_docstrings) from .file_utils import add_start_docstrings
from .modeling_bert import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"} "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model """ Load tf checkpoints in a pytorch model
...@@ -102,120 +98,6 @@ def gelu(x): ...@@ -102,120 +98,6 @@ def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class GPT2Config(PretrainedConfig):
"""Configuration class to store the configuration of a `GPT2Model`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(
self,
vocab_size_or_config_json_file=50257,
n_positions=1024,
n_ctx=1024,
n_embd=768,
n_layer=12,
n_head=12,
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
num_labels=1,
summary_type='cls_index',
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
**kwargs
):
"""Constructs GPT2Config.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
layer_norm_epsilon: epsilon to use in the layer norm layers
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
super(GPT2Config, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
class Attention(nn.Module): class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False): def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__() super(Attention, self).__init__()
...@@ -233,24 +115,31 @@ class Attention(nn.Module): ...@@ -233,24 +115,31 @@ class Attention(nn.Module):
self.c_proj = Conv1D(n_state, nx) self.c_proj = Conv1D(n_state, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop) self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads): def prune_heads(self, heads):
if len(heads) == 0: if len(heads) == 0:
return return
mask = torch.ones(self.n_head, self.split_size // self.n_head) mask = torch.ones(self.n_head, self.split_size // self.n_head)
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
for head in heads: for head in heads:
# Compute how many pruned heads are before the head and move the index accordingly
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0 mask[head] = 0
mask = mask.view(-1).contiguous().eq(1) mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long() index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
# Prune conv1d layers # Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
# Update hyper params # Update hyper params
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads)) self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
self.n_head = self.n_head - len(heads) self.n_head = self.n_head - len(heads)
self.pruned_heads = self.pruned_heads.union(heads)
def _attn(self, q, k, v, head_mask=None): def _attn(self, q, k, v, attention_mask=None, head_mask=None):
w = torch.matmul(q, k) w = torch.matmul(q, k)
if self.scale: if self.scale:
w = w / math.sqrt(v.size(-1)) w = w / math.sqrt(v.size(-1))
...@@ -258,6 +147,10 @@ class Attention(nn.Module): ...@@ -258,6 +147,10 @@ class Attention(nn.Module):
b = self.bias[:, :, ns-nd:ns, :ns] b = self.bias[:, :, ns-nd:ns, :ns]
w = w * b - 1e4 * (1 - b) w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = nn.Softmax(dim=-1)(w) w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w) w = self.attn_dropout(w)
...@@ -283,7 +176,7 @@ class Attention(nn.Module): ...@@ -283,7 +176,7 @@ class Attention(nn.Module):
else: else:
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
def forward(self, x, layer_past=None, head_mask=None): def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
x = self.c_attn(x) x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2) query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query) query = self.split_heads(query)
...@@ -295,7 +188,7 @@ class Attention(nn.Module): ...@@ -295,7 +188,7 @@ class Attention(nn.Module):
value = torch.cat((past_value, value), dim=-2) value = torch.cat((past_value, value), dim=-2)
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
attn_outputs = self._attn(query, key, value, head_mask) attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
a = attn_outputs[0] a = attn_outputs[0]
a = self.merge_heads(a) a = self.merge_heads(a)
...@@ -325,13 +218,16 @@ class Block(nn.Module): ...@@ -325,13 +218,16 @@ class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False): def __init__(self, n_ctx, config, scale=False):
super(Block, self).__init__() super(Block, self).__init__()
nx = config.n_embd nx = config.n_embd
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
self.attn = Attention(nx, n_ctx, config, scale) self.attn = Attention(nx, n_ctx, config, scale)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config) self.mlp = MLP(4 * nx, config)
def forward(self, x, layer_past=None, head_mask=None): def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask) output_attn = self.attn(self.ln_1(x),
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask)
a = output_attn[0] # output_attn: a, present, (attentions) a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a x = x + a
...@@ -354,7 +250,7 @@ class GPT2PreTrainedModel(PreTrainedModel): ...@@ -354,7 +250,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
def __init__(self, *inputs, **kwargs): def __init__(self, *inputs, **kwargs):
super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs) super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
def init_weights(self, module): def _init_weights(self, module):
""" Initialize the weights. """ Initialize the weights.
""" """
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
...@@ -363,7 +259,7 @@ class GPT2PreTrainedModel(PreTrainedModel): ...@@ -363,7 +259,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
elif isinstance(module, LayerNorm): elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_() module.bias.data.zero_()
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
...@@ -394,20 +290,24 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: ...@@ -394,20 +290,24 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left. the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**past**: **past**:
list of ``torch.FloatTensor`` (one for each layer): list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. (see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
...@@ -451,9 +351,9 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -451,9 +351,9 @@ class GPT2Model(GPT2PreTrainedModel):
self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop) self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.apply(self.init_weights) self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
...@@ -466,7 +366,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -466,7 +366,7 @@ class GPT2Model(GPT2PreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads) self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None): def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
if past is None: if past is None:
past_length = 0 past_length = 0
past = [None] * len(self.h) past = [None] * len(self.h)
...@@ -476,6 +376,23 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -476,6 +376,23 @@ class GPT2Model(GPT2PreTrainedModel):
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids) position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Attention mask.
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is simpler than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
attention_mask = (1.0 - attention_mask) * -10000.0
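The transform in the comments above is easy to verify by hand: kept positions map to 0.0 and padded positions to -10000.0, so after the additive application in ``_attn`` the softmax weight of padded positions is effectively zero. A tiny numeric check:

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])        # last position is padding
    extended = attention_mask.unsqueeze(1).unsqueeze(2)  # (1, 1, 1, 4) for broadcast
    extended = (1.0 - extended.float()) * -10000.0       # [0, 0, 0, -10000]

    scores = torch.zeros(1, 1, 1, 4) + extended          # raw scores + mask
    print(torch.softmax(scores, dim=-1))                 # padded weight ~ 0.0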
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N # attention_probs has shape bsz x n_heads x N x N
...@@ -513,7 +430,11 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -513,7 +430,11 @@ class GPT2Model(GPT2PreTrainedModel):
if self.output_hidden_states: if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states, layer_past, head_mask[i]) outputs = block(hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i])
hidden_states, present = outputs[:2] hidden_states, present = outputs[:2]
presents = presents + (present,) presents = presents + (present,)
...@@ -568,8 +489,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -568,8 +489,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
Examples:: Examples::
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2] loss, logits = outputs[:2]
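The ``past`` input threaded through the stack in this diff enables incremental decoding: run the full prompt once, then feed only the newest token together with the cached keys/values. A hedged sketch of a greedy loop built on that (same model and tokenizer as the example above):

    import torch
    from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    generated = torch.tensor(tokenizer.encode("Hello, my dog is")).unsqueeze(0)
    past = None
    with torch.no_grad():
        for _ in range(5):
            inputs = generated if past is None else generated[:, -1:]
            logits, past = model(inputs, past=past)[:2]  # logits and cached key/values
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy pick
            generated = torch.cat([generated, next_token], dim=1)
    print(tokenizer.decode(generated[0].tolist()))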
...@@ -580,7 +505,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -580,7 +505,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
self.transformer = GPT2Model(config) self.transformer = GPT2Model(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -590,9 +515,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -590,9 +515,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
self._tie_or_clone_weights(self.lm_head, self._tie_or_clone_weights(self.lm_head,
self.transformer.wte) self.transformer.wte)
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None): def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, labels=None):
past=past, head_mask=head_mask) transformer_outputs = self.transformer(input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states) lm_logits = self.lm_head(hidden_states)
...@@ -615,33 +545,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -615,33 +545,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings, The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence. the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING) """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel): class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
r""" Inputs: r"""
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
...@@ -676,13 +585,25 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Examples::
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer))  # The new token is the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices encoded_choices = [tokenizer.encode(s) for s in choices]
mc_token_ids = torch.tensor([input_ids.size(-1), input_ids.size(-1)]).unsqueeze(0) # Batch size 1 cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
...@@ -692,7 +613,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.multiple_choice_head = SequenceSummary(config)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
...@@ -701,10 +623,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
self._tie_or_clone_weights(self.lm_head,
self.transformer.wte)
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......
...@@ -30,15 +30,13 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_bert import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__)
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
...@@ -127,111 +125,6 @@ def swish(x):
ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
class OpenAIGPTConfig(PretrainedConfig):
"""
Configuration class to store the configuration of a `OpenAIGPTModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file.
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
n_positions: Number of positional embeddings.
n_ctx: Size of the causal mask (usually same as n_positions).
n_embd: Dimensionality of the embeddings and hidden states.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
afn: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
resid_pdrop: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attn_pdrop: The dropout ratio for the attention
probabilities.
embd_pdrop: The dropout ratio for the embeddings.
layer_norm_epsilon: epsilon to use in the layer norm layers
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
predict_special_tokens: should we predict special tokens (when the model has a LM head)
"""
pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(
self,
vocab_size_or_config_json_file=40478,
n_positions=512,
n_ctx=512,
n_embd=768,
n_layer=12,
n_head=12,
afn="gelu",
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
predict_special_tokens=True,
num_labels=1,
summary_type='cls_index',
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
**kwargs
):
"""Constructs OpenAIGPTConfig.
"""
super(OpenAIGPTConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.n_ctx = n_ctx
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.afn = afn
self.resid_pdrop = resid_pdrop
self.embd_pdrop = embd_pdrop
self.attn_pdrop = attn_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.predict_special_tokens = predict_special_tokens
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_first_dropout = summary_first_dropout
self.summary_proj_to_labels = summary_proj_to_labels
else:
raise ValueError(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
@property
def max_position_embeddings(self):
return self.n_positions
@property
def hidden_size(self):
return self.n_embd
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
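The properties at the end of the removed class above simply alias the GPT-style attribute names to the BERT-style names used elsewhere in the library. A sketch, assuming `OpenAIGPTConfig` is imported from its new home in `configuration_openai`:

config = OpenAIGPTConfig(vocab_size_or_config_json_file=40478)
assert config.hidden_size == config.n_embd                     # 768 by default
assert config.max_position_embeddings == config.n_positions   # 512 by default
assert config.num_attention_heads == config.n_head            # 12 by default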
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super(Attention, self).__init__()
...@@ -249,12 +142,15 @@ class Attention(nn.Module):
self.c_proj = Conv1D(n_state, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
mask = torch.ones(self.n_head, self.split_size // self.n_head)
heads = set(heads) - self.pruned_heads
for head in heads:
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
...@@ -265,8 +161,9 @@ class Attention(nn.Module):
# Update hyper params
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
self.n_head = self.n_head - len(heads)
self.pruned_heads = self.pruned_heads.union(heads)
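The remapping line added above compensates for heads pruned in earlier calls: callers give indices in terms of the original layout, but the weight matrix has already shrunk. A tiny sketch of the arithmetic:

pruned_heads = {1}                    # head 1 was pruned in an earlier call
heads = {3} - pruned_heads            # ignore heads that are already gone
for head in heads:
    # original head 3 now lives at position 2 in the smaller matrix
    head -= sum(1 if h < head else 0 for h in pruned_heads)
    print(head)                       # -> 2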
def _attn(self, q, k, v, attention_mask=None, head_mask=None):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
...@@ -275,6 +172,10 @@ class Attention(nn.Module):
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e9 * (1 - b)
if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
...@@ -300,14 +201,14 @@ class Attention(nn.Module):
else:
return x.permute(0, 2, 1, 3)
def forward(self, x, attention_mask=None, head_mask=None):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
a = attn_outputs[0]
a = self.merge_heads(a)
...@@ -338,12 +239,12 @@ class Block(nn.Module):
super(Block, self).__init__()
nx = config.n_embd
self.attn = Attention(nx, n_ctx, config, scale)
self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
def forward(self, x, attention_mask=None, head_mask=None):
attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
a = attn_outputs[0]
n = self.ln_1(x + a)
...@@ -363,10 +264,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_openai_gpt
base_model_prefix = "transformer"
def _init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
...@@ -375,7 +273,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
...@@ -409,17 +307,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
...@@ -460,7 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.drop = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
...@@ -473,7 +371,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
if position_ids is None:
# This was used when we had a single embedding matrix for position and token embeddings
# start = self.config.vocab_size + self.config.n_special
...@@ -482,6 +380,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Attention mask.
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is simpler than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
attention_mask = (1.0 - attention_mask) * -10000.0
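A quick numeric sketch of the conversion above (values hypothetical): a padding mask of ones and zeros becomes an additive bias that leaves attended positions untouched and pushes masked positions far below the softmax range.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])           # (batch, seq_len)
extended = attention_mask[:, None, None, :].float()     # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -10000.0
print(extended)    # 0.0 for kept positions, -10000.0 for the masked one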
# Prepare head mask if needed
# 1.0 in head_mask indicates we keep the head
# attention_probs has shape bsz x n_heads x N x N
...@@ -518,7 +433,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states, attention_mask, head_mask[i])
hidden_states = outputs[0]
if self.output_attentions:
all_attentions = all_attentions + (outputs[1],)
...@@ -573,7 +488,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
self.transformer = OpenAIGPTModel(config)
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.init_weights()
self.tie_weights()
def tie_weights(self):
...@@ -583,8 +498,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
self._tie_or_clone_weights(self.lm_head,
self.transformer.tokens_embed)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
...@@ -607,40 +526,19 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING) """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
r""" Inputs: r"""
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
...@@ -673,7 +571,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1, index of the last token
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
...@@ -684,7 +582,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
self.multiple_choice_head = SequenceSummary(config)
self.init_weights()
self.tie_weights()
def tie_weights(self):
...@@ -694,9 +592,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
self._tie_or_clone_weights(self.lm_head,
self.transformer.tokens_embed)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
hidden_states = transformer_outputs[0]
......
...@@ -22,14 +22,11 @@ import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
...@@ -39,13 +36,6 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
}
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
}
class RobertaEmbeddings(BertEmbeddings):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
...@@ -61,11 +51,9 @@ class RobertaEmbeddings(BertEmbeddings):
# cf. fairseq's `utils.make_positions`
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
return super(RobertaEmbeddings, self).forward(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids)
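The offset above follows fairseq's `utils.make_positions` convention: position numbering starts at `padding_idx + 1` (2 for RoBERTa, since `padding_idx` is 1), keeping the low indices reserved for padding. A small sketch:

import torch

padding_idx, seq_length = 1, 5
position_ids = torch.arange(padding_idx + 1, seq_length + padding_idx + 1, dtype=torch.long)
print(position_ids)    # tensor([2, 3, 4, 5, 6])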
class RobertaConfig(BertConfig):
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
...@@ -116,13 +104,20 @@ ROBERTA_INPUTS_DOCSTRING = r""" ...@@ -116,13 +104,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Optional segment token indices to indicate first and second portions of the inputs.
This embedding matrix is not trained (not pretrained during RoBERTa pretraining), you will have to train it
during finetuning.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1[``.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
...@@ -168,14 +163,18 @@ class RobertaModel(BertModel):
super(RobertaModel, self).__init__(config)
self.embeddings = RobertaEmbeddings(config)
self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
if input_ids[:, 0].sum().item() != 0:
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
"This model requires special tokens in order to work. "
"Please specify add_special_tokens=True in your encoding.")
return super(RobertaModel, self).forward(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
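The `input_ids[:, 0].sum()` check above is a cheap test that every sequence in the batch starts with `<s>`, whose id is 0 in the RoBERTa vocabulary. A sketch of an encoding that passes the check (tokenizer API as in this library at the time; treat as illustrative):

from pytorch_transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
ids = tokenizer.encode("Hello world", add_special_tokens=True)
print(ids[0])    # 0, the <s> token, so the warning is not triggered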
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
...@@ -220,7 +219,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): ...@@ -220,7 +219,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
self.roberta = RobertaModel(config) self.roberta = RobertaModel(config)
self.lm_head = RobertaLMHead(config) self.lm_head = RobertaLMHead(config)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self):
...@@ -229,10 +228,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
"""
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
masked_lm_labels=None):
outputs = self.roberta(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
...@@ -313,10 +315,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.roberta = RobertaModel(config)
self.classifier = RobertaClassificationHead(config)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
labels=None):
outputs = self.roberta(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
......
...@@ -34,18 +34,16 @@ import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
}
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
}
def build_tf_to_pytorch_map(model, config):
""" A map of modules from TF to PyTorch.
...@@ -175,143 +173,6 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
return model
class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `TransfoXLModel` or a configuration json file.
cutoffs: cutoffs for the adaptive softmax
d_model: Dimensionality of the model's hidden states.
d_embed: Dimensionality of the embeddings
d_head: Dimensionality of the model's heads.
div_val: dividend value for adaptive input and softmax
pre_lnorm: apply LayerNorm to the input instead of the output
d_inner: Inner dimension in FF
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
tgt_len: number of tokens to predict
ext_len: length of the extended context
mem_len: length of the retained previous hidden states (the memory)
same_length: use the same attn length for all tokens
proj_share_all_but_first: True to share all but first projs, False not to share.
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
clamp_len: use the same pos embeddings after clamp_len
sample_softmax: number of samples in sampled softmax
adaptive: use adaptive softmax
tie_weight: tie the word embedding and softmax weights
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention probabilities.
untie_r: untie relative position biases
embd_pdrop: The dropout ratio for the embeddings.
init: parameter initializer to use
init_range: parameters initialized by U(-init_range, init_range).
proj_init_std: projection parameters initialized by N(0, proj_init_std)
init_std: parameters initialized by N(0, init_std)
"""
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=267735,
cutoffs=[20000, 40000, 200000],
d_model=1024,
d_embed=1024,
n_head=16,
d_head=64,
d_inner=4096,
div_val=4,
pre_lnorm=False,
n_layer=18,
tgt_len=128,
ext_len=0,
mem_len=1600,
clamp_len=1000,
same_length=True,
proj_share_all_but_first=True,
attn_type=0,
sample_softmax=-1,
adaptive=True,
tie_weight=True,
dropout=0.1,
dropatt=0.0,
untie_r=True,
init="normal",
init_range=0.01,
proj_init_std=0.01,
init_std=0.02,
**kwargs):
"""Constructs TransfoXLConfig.
"""
super(TransfoXLConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.n_token = vocab_size_or_config_json_file
self.cutoffs = []
self.cutoffs.extend(cutoffs)
self.tie_weight = tie_weight
if proj_share_all_but_first:
self.tie_projs = [False] + [True] * len(self.cutoffs)
else:
self.tie_projs = [False] + [False] * len(self.cutoffs)
self.d_model = d_model
self.d_embed = d_embed
self.d_head = d_head
self.d_inner = d_inner
self.div_val = div_val
self.pre_lnorm = pre_lnorm
self.n_layer = n_layer
self.n_head = n_head
self.tgt_len = tgt_len
self.ext_len = ext_len
self.mem_len = mem_len
self.same_length = same_length
self.attn_type = attn_type
self.clamp_len = clamp_len
self.sample_softmax = sample_softmax
self.adaptive = adaptive
self.dropout = dropout
self.dropatt = dropatt
self.untie_r = untie_r
self.init = init
self.init_range = init_range
self.proj_init_std = proj_init_std
self.init_std = init_std
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def max_position_embeddings(self):
return self.tgt_len + self.ext_len + self.mem_len
@property
def vocab_size(self):
return self.n_token
@vocab_size.setter
def vocab_size(self, value):
self.n_token = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
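One detail of the removed class worth spelling out: `proj_share_all_but_first` controls the `tie_projs` pattern used by the adaptive softmax clusters. A sketch with the default cutoffs:

cutoffs = [20000, 40000, 200000]
proj_share_all_but_first = True
if proj_share_all_but_first:
    tie_projs = [False] + [True] * len(cutoffs)
else:
    tie_projs = [False] + [False] * len(cutoffs)
print(tie_projs)    # [False, True, True, True]; only the head cluster keeps its own projection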
class PositionalEmbedding(nn.Module):
def __init__(self, demb):
super(PositionalEmbedding, self).__init__()
...@@ -347,7 +208,7 @@ class PositionwiseFF(nn.Module):
nn.Dropout(dropout),
)
self.layer_norm = nn.LayerNorm(d_model)
self.pre_lnorm = pre_lnorm
...@@ -370,7 +231,7 @@ class PositionwiseFF(nn.Module):
class MultiHeadAttn(nn.Module):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
super(MultiHeadAttn, self).__init__()
...@@ -387,7 +248,7 @@ class MultiHeadAttn(nn.Module):
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.scale = 1 / (d_head ** 0.5)
...@@ -423,7 +284,8 @@ class MultiHeadAttn(nn.Module):
# [qlen x klen x bsz x n_head]
attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))
attn_score.mul_(self.scale)
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
...@@ -476,7 +338,7 @@ class RelMultiHeadAttn(nn.Module):
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.scale = 1 / (d_head ** 0.5)
...@@ -586,13 +448,22 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
attn_score.mul_(self.scale)
#### compute attention probability
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
if next(self.parameters()).dtype == torch.float16:
attn_score = attn_score.float().masked_fill(
attn_mask[None,:,:,None], -65000).type_as(attn_score)
else:
attn_score = attn_score.float().masked_fill(
attn_mask[None,:,:,None], -1e30).type_as(attn_score)
elif attn_mask.dim() == 3:
if next(self.parameters()).dtype == torch.float16:
attn_score = attn_score.float().masked_fill(
attn_mask[:,:,:,None], -65000).type_as(attn_score)
else:
attn_score = attn_score.float().masked_fill(
attn_mask[:,:,:,None], -1e30).type_as(attn_score)
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
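The new `-65000` branches above exist because float16 cannot represent `-1e30`: the cast overflows to `-inf`, which can produce NaNs downstream. A sketch of the failure mode:

import torch

print(torch.tensor(-1e30).to(torch.float16))     # tensor(-inf, dtype=torch.float16)
print(torch.tensor(-65000.).to(torch.float16))   # finite: fp16 max magnitude is ~65504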
...@@ -680,7 +551,8 @@ class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
attn_score.mul_(self.scale)
#### compute attention probability
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
...@@ -723,7 +595,7 @@ class DecoderLayer(nn.Module):
super(DecoderLayer, self).__init__()
self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
...@@ -743,7 +615,7 @@ class RelLearnableDecoderLayer(nn.Module):
self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
**kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
...@@ -764,7 +636,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
...@@ -781,7 +653,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
sample_softmax=False):
super(AdaptiveEmbedding, self).__init__()
...@@ -819,7 +691,7 @@ class AdaptiveEmbedding(nn.Module):
else:
param = next(self.parameters())
inp_flat = inp.view(-1)
emb_flat = torch.zeros([inp_flat.size(0), self.d_proj],
dtype=param.dtype, device=param.device)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
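The cutoff bookkeeping assumed above: `cutoff_ends` brackets each vocabulary band, and each band `[l_idx, r_idx)` gets its own, progressively smaller, embedding table. A sketch with the default Transformer-XL cutoffs:

n_token, cutoffs = 267735, [20000, 40000, 200000]
cutoff_ends = [0] + cutoffs + [n_token]
for i in range(len(cutoff_ends) - 1):
    l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
    print(l_idx, r_idx)    # (0, 20000), (20000, 40000), (40000, 200000), (200000, 267735)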
...@@ -853,9 +725,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_transfo_xl
base_model_prefix = "transformer"
def __init__(self, *inputs, **kwargs):
super(TransfoXLPreTrainedModel, self).__init__(*inputs, **kwargs)
def _init_weight(self, weight):
if self.config.init == 'uniform':
nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
...@@ -865,7 +734,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
def _init_bias(self, bias):
nn.init.constant_(bias, 0.0)
def _init_weights(self, m):
""" Initialize the weights. """ Initialize the weights.
""" """
classname = m.__class__.__name__ classname = m.__class__.__name__
...@@ -991,7 +860,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -991,7 +860,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
self.n_head = config.n_head
self.d_head = config.d_head
self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
div_val=config.div_val)
self.drop = nn.Dropout(config.dropout)
...@@ -1059,7 +928,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
self.r_emb = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head, self.d_head))
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
return self.word_emb
...@@ -1135,22 +1004,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
mlen = mems[0].size(0) if mems is not None else 0
klen = mlen + qlen
if self.same_length:
all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
mask_len = klen - self.mem_len
if mask_len > 0:
mask_shift_len = qlen - mask_len
else:
mask_shift_len = qlen
dec_attn_mask = (torch.triu(all_ones, 1+mlen)
+ torch.tril(all_ones, -mask_shift_len))[:, :, None]  # -1
else:
dec_attn_mask = torch.triu(
word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
hids = [] hids = []
attentions = [] attentions = []
if self.attn_type == 0: # default if self.attn_type == 0: # default
pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device,
dtype=word_emb.dtype) dtype=word_emb.dtype)
if self.clamp_len > 0: if self.clamp_len > 0:
pos_seq.clamp_(max=self.clamp_len) pos_seq.clamp_(max=self.clamp_len)
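Note that the mask is now built directly as a ``torch.uint8`` tensor instead of casting with ``.bool()`` afterwards, presumably for compatibility with PyTorch versions that predate the bool dtype. As a quick illustration of the causal mask built in the non-``same_length`` branch, here is a minimal sketch with hypothetical sizes:

import torch

qlen, mlen = 4, 2                       # hypothetical: 4 new tokens, 2 memory slots
klen = qlen + mlen
ones = torch.ones((qlen, klen), dtype=torch.uint8)
dec_attn_mask = torch.triu(ones, diagonal=1 + mlen)[:, :, None]
print(dec_attn_mask[:, :, 0])
# tensor([[0, 0, 0, 1, 1, 1],
#         [0, 0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0, 0]], dtype=torch.uint8)
# Row i masks (value 1) every key position in the future of token i; memory slots stay visible.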
...@@ -1304,9 +1173,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1304,9 +1173,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
# use adaptive softmax (including standard softmax) # use adaptive softmax (including standard softmax)
else: else:
self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
config.cutoffs, div_val=config.div_val) config.cutoffs, div_val=config.div_val)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -1342,7 +1211,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1342,7 +1211,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
def init_mems(self, data): def init_mems(self, data):
return self.transformer.init_mems(data) return self.transformer.init_mems(data)
def forward(self, input_ids, labels=None, mems=None, head_mask=None): def forward(self, input_ids, mems=None, head_mask=None, labels=None):
bsz = input_ids.size(0) bsz = input_ids.size(0)
tgt_len = input_ids.size(1) tgt_len = input_ids.size(1)
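Note that ``labels`` moved from second position to the end of the signature. A hedged usage sketch (``model`` and the tensors are hypothetical); passing optional arguments by keyword keeps call sites robust to such reorderings:

# `model` is a TransfoXLLMHeadModel; `input_ids` and `mems` are hypothetical tensors.
outputs = model(input_ids, mems=mems, labels=input_ids)
# The positional style `model(input_ids, labels)` would now feed `labels` into `mems`.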
......
...@@ -30,14 +30,11 @@ from torch import nn ...@@ -30,14 +30,11 @@ from torch import nn
from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss
from torch.nn import functional as F from torch.nn import functional as F
from .file_utils import cached_path from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
TF_WEIGHTS_NAME = 'model.ckpt'
try: try:
from torch.nn import Identity from torch.nn import Identity
...@@ -52,194 +49,6 @@ except ImportError: ...@@ -52,194 +49,6 @@ except ImportError:
def forward(self, input): def forward(self, input):
return input return input
if not six.PY2:
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = ''.join(docstr) + fn.__doc__
return fn
return docstring_decorator
else:
# Not possible to update class docstrings on python2
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
return fn
return docstring_decorator
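The ``add_start_docstrings`` helper (and its Python 2 no-op fallback) moves out of ``modeling_utils`` into ``file_utils``, as the updated imports below show. Its use is unchanged; a sketch of the pattern, with an illustrative docstring constant (Python 3 behavior; the removed Python 2 branch was a no-op):

from pytorch_transformers.file_utils import add_start_docstrings

SHARED_DOCSTRING = "Shared introduction for a family of models. "   # illustrative constant

@add_start_docstrings(SHARED_DOCSTRING)
def forward(input_ids):
    """Model-specific tail of the docstring."""
    return input_ids

assert forward.__doc__.startswith(SHARED_DOCSTRING)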
class PretrainedConfig(object):
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes):
- ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of the associated pretrained model configurations as values.
Parameters:
``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
``output_attentions``: boolean, default `False`. Should the model return attention weights.
``output_hidden_states``: boolean, default `False`. Should the model return all hidden states.
``torchscript``: boolean, default `False`. Is the model to be used with TorchScript.
"""
pretrained_config_archive_map = {}
def __init__(self, **kwargs):
self.finetuning_task = kwargs.pop('finetuning_task', None)
self.num_labels = kwargs.pop('num_labels', 2)
self.output_attentions = kwargs.pop('output_attentions', False)
self.output_hidden_states = kwargs.pop('output_hidden_states', False)
self.torchscript = kwargs.pop('torchscript', False)
def save_pretrained(self, save_directory):
""" Save a configuration object to the directory `save_directory`, so that it
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# If we save using the predefined names, we can load using `from_pretrained`
output_config_file = os.path.join(save_directory, CONFIG_NAME)
self.to_json_file(output_config_file)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
force_download: (`optional`) boolean, default False:
Force a (re-)download of the model weights and configuration files, overriding any cached versions.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final configuration object.
- If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
Examples::
# We can't directly instantiate the base class `PretrainedConfig`, so we show the examples on a
# derived class: BertConfig
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
assert config.output_attentions == True
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
                                                   foo=False, return_unused_kwargs=True)
assert config.output_attentions == True
assert unused_kwargs == {'foo': False}
"""
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
else:
config_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_config_archive_map.keys()),
config_file))
raise e
if resolved_config_file == config_file:
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading configuration file {} from cache at {}".format(
config_file, resolved_config_file))
# Load config
config = cls.from_json_file(resolved_config_file)
# Update config with kwargs if needed
to_remove = []
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
logger.info("Model config %s", config)
if return_unused_kwargs:
return config, kwargs
else:
return config
@classmethod
def from_dict(cls, json_object):
"""Constructs a `Config` from a Python dictionary of parameters."""
config = cls(vocab_size_or_config_json_file=-1)
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
def to_json_file(self, json_file_path):
""" Save this instance to a json file."""
with open(json_file_path, "w", encoding='utf-8') as writer:
writer.write(self.to_json_string())
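A minimal round-trip through the serialization helpers above, sketched with a derived class (directory path hypothetical; ``save_pretrained`` expects an existing directory):

import os
from pytorch_transformers import BertConfig

os.makedirs('./my_model_directory/', exist_ok=True)
config = BertConfig(num_labels=3)
config.save_pretrained('./my_model_directory/')                # writes config.json
reloaded = BertConfig.from_pretrained('./my_model_directory/')
assert reloaded.to_dict() == config.to_dict()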
class PreTrainedModel(nn.Module): class PreTrainedModel(nn.Module):
r""" Base class for all models. r""" Base class for all models.
...@@ -300,7 +109,7 @@ class PreTrainedModel(nn.Module): ...@@ -300,7 +109,7 @@ class PreTrainedModel(nn.Module):
new_embeddings.to(old_embeddings.weight.device) new_embeddings.to(old_embeddings.weight.device)
# initialize all new embeddings (in particular added tokens) # initialize all new embeddings (in particular added tokens)
self.init_weights(new_embeddings) self._init_weights(new_embeddings)
# Copy word embeddings from the previous weights # Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens) num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
...@@ -316,6 +125,14 @@ class PreTrainedModel(nn.Module): ...@@ -316,6 +125,14 @@ class PreTrainedModel(nn.Module):
else: else:
first_module.weight = second_module.weight first_module.weight = second_module.weight
if hasattr(first_module, 'bias') and first_module.bias is not None:
first_module.bias.data = torch.nn.functional.pad(
first_module.bias.data,
(0, first_module.weight.shape[0] - first_module.bias.shape[0]),
'constant',
0
)
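The padding added above covers the case where a tied output layer's weight was grown (e.g. by ``resize_token_embeddings``) while its bias kept the old length; the arithmetic in isolation, with hypothetical shapes:

import torch
import torch.nn.functional as F

weight = torch.zeros(12, 8)    # hypothetical: weight resized from 10 to 12 rows
bias = torch.zeros(10)         # bias still has the old length
bias = F.pad(bias, (0, weight.shape[0] - bias.shape[0]), 'constant', 0)
assert bias.shape[0] == 12     # new entries are zero-initialized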
def resize_token_embeddings(self, new_num_tokens=None): def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
...@@ -323,7 +140,7 @@ class PreTrainedModel(nn.Module): ...@@ -323,7 +140,7 @@ class PreTrainedModel(nn.Module):
Arguments: Arguments:
new_num_tokens: (`optional`) int: new_num_tokens: (`optional`) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embedding`` Module of the model. If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embedding`` Module of the model.
Return: ``torch.nn.Embedding`` Return: ``torch.nn.Embedding``
...@@ -344,14 +161,30 @@ class PreTrainedModel(nn.Module): ...@@ -344,14 +161,30 @@ class PreTrainedModel(nn.Module):
return model_embeds return model_embeds
def init_weights(self):
""" Initialize and prunes weights if needed. """
# Initialize weights
self.apply(self._init_weights)
# Prune heads if needed
if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads)
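This is the crux of the refactor seen throughout this commit: models now call the public ``init_weights()`` once from their constructors, and each subclass supplies the per-module ``_init_weights(module)`` hook (the old public name, made private). A schematic sketch, not the library's exact code:

from torch import nn
from pytorch_transformers.modeling_utils import PreTrainedModel

class MyPreTrainedModel(PreTrainedModel):        # hypothetical subclass
    def _init_weights(self, module):
        # Per-module hook; replaces the old public `init_weights(module)`.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)

class MyModel(MyPreTrainedModel):                # hypothetical model
    def __init__(self, config):
        super(MyModel, self).__init__(config)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.init_weights()    # applies _init_weights to each submodule, then re-prunes heads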
def prune_heads(self, heads_to_prune): def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model. """ Prunes heads of the base model.
Arguments: Arguments:
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
""" """
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
base_model._prune_heads(heads_to_prune) base_model._prune_heads(heads_to_prune)
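Recording the union on ``config.pruned_heads`` makes pruning cumulative and lets it survive save/load round-trips; a hedged usage sketch (``model``: any concrete ``PreTrainedModel``):

model.prune_heads({1: [0, 2]})    # original head indices
model.prune_heads({1: [2, 3]})    # head 2 is already gone; the union is kept on the config
assert set(model.config.pruned_heads[1]) == {0, 2, 3}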
def save_pretrained(self, save_directory): def save_pretrained(self, save_directory):
...@@ -601,7 +434,10 @@ class PoolerStartLogits(nn.Module): ...@@ -601,7 +434,10 @@ class PoolerStartLogits(nn.Module):
x = self.dense(hidden_states).squeeze(-1) x = self.dense(hidden_states).squeeze(-1)
if p_mask is not None: if p_mask is not None:
x = x * (1 - p_mask) - 1e30 * p_mask if next(self.parameters()).dtype == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x return x
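The new branch exists because -1e30 is not representable in half precision (fp16 tops out near 65504), so it overflows to ``-inf``, which can poison the subsequent softmax; a small demonstration:

import torch

x = torch.tensor([2., 3.], dtype=torch.float16)
p_mask = torch.tensor([0., 1.], dtype=torch.float16)
print(x * (1 - p_mask) - 1e30 * p_mask)    # tensor([2., -inf], dtype=torch.float16)
print(x * (1 - p_mask) - 65500 * p_mask)   # tensor([2., -65504.], dtype=torch.float16)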
......
...@@ -16,11 +16,8 @@ ...@@ -16,11 +16,8 @@
""" """
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging import logging
import math import math
import sys
from io import open
import itertools import itertools
import numpy as np import numpy as np
...@@ -30,8 +27,9 @@ from torch import nn ...@@ -30,8 +27,9 @@ from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings, from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead
prune_linear_layer, SequenceSummary, SQuADHead) from .configuration_xlm import XLMConfig
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -44,161 +42,9 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { ...@@ -44,161 +42,9 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin",
} }
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
}
class XLMConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `XLMModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `XLMModel`.
d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLM, 'uni' for Transformer-XL
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
bi_data: bool, whether to use bidirectional input pipeline.
Usually set to True during pretraining and False during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len.
-1 means no clamping.
same_length: bool, whether to use the same attention length for each token.
"""
pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30145,
emb_dim=2048,
n_layers=12,
n_heads=16,
dropout=0.1,
attention_dropout=0.1,
gelu_activation=True,
sinusoidal_embeddings=False,
causal=False,
asm=False,
n_langs=1,
max_position_embeddings=512,
embed_init_std=2048 ** -0.5,
layer_norm_eps=1e-12,
init_std=0.02,
bos_index=0,
eos_index=1,
pad_index=2,
unk_index=3,
mask_index=5,
is_encoder=True,
finetuning_task=None,
num_labels=2,
summary_type='first',
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
start_n_top=5,
end_n_top=5,
**kwargs):
"""Constructs XLMConfig.
"""
super(XLMConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.n_words = vocab_size_or_config_json_file
self.emb_dim = emb_dim
self.n_layers = n_layers
self.n_heads = n_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.gelu_activation = gelu_activation
self.sinusoidal_embeddings = sinusoidal_embeddings
self.causal = causal
self.asm = asm
self.n_langs = n_langs
self.layer_norm_eps = layer_norm_eps
self.bos_index = bos_index
self.eos_index = eos_index
self.pad_index = pad_index
self.unk_index = unk_index
self.mask_index = mask_index
self.is_encoder = is_encoder
self.max_position_embeddings = max_position_embeddings
self.embed_init_std = embed_init_std
self.init_std = init_std
self.finetuning_task = finetuning_task
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_proj_to_labels = summary_proj_to_labels
self.summary_first_dropout = summary_first_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def vocab_size(self):
return self.n_words
@vocab_size.setter
def vocab_size(self, value):
self.n_words = value
@property
def hidden_size(self):
return self.emb_dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers
def create_sinusoidal_embeddings(n_pos, dim, out): def create_sinusoidal_embeddings(n_pos, dim, out):
...@@ -265,13 +111,16 @@ class MultiHeadAttention(nn.Module): ...@@ -265,13 +111,16 @@ class MultiHeadAttention(nn.Module):
self.k_lin = nn.Linear(dim, dim) self.k_lin = nn.Linear(dim, dim)
self.v_lin = nn.Linear(dim, dim) self.v_lin = nn.Linear(dim, dim)
self.out_lin = nn.Linear(dim, dim) self.out_lin = nn.Linear(dim, dim)
self.pruned_heads = set()
def prune_heads(self, heads): def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads attention_head_size = self.dim // self.n_heads
if len(heads) == 0: if len(heads) == 0:
return return
mask = torch.ones(self.n_heads, attention_head_size) mask = torch.ones(self.n_heads, attention_head_size)
heads = set(heads) - self.pruned_heads
for head in heads: for head in heads:
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0 mask[head] = 0
mask = mask.view(-1).contiguous().eq(1) mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long() index = torch.arange(len(mask))[mask].long()
...@@ -283,6 +132,7 @@ class MultiHeadAttention(nn.Module): ...@@ -283,6 +132,7 @@ class MultiHeadAttention(nn.Module):
# Update hyper params # Update hyper params
self.n_heads = self.n_heads - len(heads) self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
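The subtraction added above translates original head indices into current row positions after earlier prunings; the arithmetic in isolation:

pruned_heads = {0, 2}    # original indices pruned earlier
head = 3                 # original index requested now
head -= sum(1 if h < head else 0 for h in pruned_heads)
assert head == 1         # row 1 of the remaining heads' mask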
def forward(self, input, mask, kv=None, cache=None, head_mask=None): def forward(self, input, mask, kv=None, cache=None, head_mask=None):
""" """
...@@ -377,7 +227,7 @@ class XLMPreTrainedModel(PreTrainedModel): ...@@ -377,7 +227,7 @@ class XLMPreTrainedModel(PreTrainedModel):
def __init__(self, *inputs, **kwargs): def __init__(self, *inputs, **kwargs):
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
def init_weights(self, module): def _init_weights(self, module):
""" Initialize the weights. """ """ Initialize the weights. """
if isinstance(module, nn.Embedding): if isinstance(module, nn.Embedding):
if self.config is not None and self.config.embed_init_std is not None: if self.config is not None and self.config.embed_init_std is not None:
...@@ -431,23 +281,23 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -431,23 +281,23 @@ XLM_INPUTS_DOCSTRING = r"""
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings. Mask to avoid performing attention on padding token indices.
Selected in the range ``[0, config.max_position_embeddings - 1]``. Mask values selected in ``[0, 1]``:
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens to be used to indicate the language of each token in the input. A parallel sequence of tokens to be used to indicate the language of each token in the input.
Indices are languages ids which can be obtained from the language names by using two conversion mappings Indices are languages ids which can be obtained from the language names by using two conversion mappings
provided in the configuration of the model (only provided for multilingual models). provided in the configuration of the model (only provided for multilingual models).
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices. A parallel sequence of tokens (can be used to indicate various portions of the inputs).
Mask values selected in ``[0, 1]``: The embeddings from these tokens will be summed with the respective token embeddings.
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Length of each sentence that can be used to avoid performing attention on padding token indices. Length of each sentence that can be used to avoid performing attention on padding token indices.
You can also use `attention_mask` for the same result (see above), kept here for compatibility. You can also use `attention_mask` for the same result (see above), kept here for compatibility.
...@@ -488,7 +338,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -488,7 +338,7 @@ class XLMModel(XLMPreTrainedModel):
""" """
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output', ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', 'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm', 'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value'] 'asm_cutoffs', 'asm_div_value']
...@@ -507,6 +357,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -507,6 +357,7 @@ class XLMModel(XLMPreTrainedModel):
# dictionary / languages # dictionary / languages
self.n_langs = config.n_langs self.n_langs = config.n_langs
self.use_lang_emb = config.use_lang_emb
self.n_words = config.n_words self.n_words = config.n_words
self.eos_index = config.eos_index self.eos_index = config.eos_index
self.pad_index = config.pad_index self.pad_index = config.pad_index
...@@ -529,7 +380,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -529,7 +380,7 @@ class XLMModel(XLMPreTrainedModel):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
if config.sinusoidal_embeddings: if config.sinusoidal_embeddings:
create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
if config.n_langs > 1: if config.n_langs > 1 and config.use_lang_emb:
self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
...@@ -552,7 +403,14 @@ class XLMModel(XLMPreTrainedModel): ...@@ -552,7 +403,14 @@ class XLMModel(XLMPreTrainedModel):
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
self.apply(self.init_weights) if hasattr(config, "pruned_heads"):
pruned_heads = config.pruned_heads.copy().items()
config.pruned_heads = {}
for layer, heads in pruned_heads:
if self.attentions[int(layer)].n_heads == config.n_heads:
self.prune_heads({int(layer): list(map(int, heads))})
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
...@@ -566,8 +424,8 @@ class XLMModel(XLMPreTrainedModel): ...@@ -566,8 +424,8 @@ class XLMModel(XLMPreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads) self.attentions[layer].prune_heads(heads)
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None, lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None
if lengths is None: if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long() lengths = (input_ids != self.pad_index).sum(dim=1).long()
# mask = input_ids != self.pad_index # mask = input_ids != self.pad_index
...@@ -628,7 +486,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -628,7 +486,7 @@ class XLMModel(XLMPreTrainedModel):
# embeddings # embeddings
tensor = self.embeddings(input_ids) tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
if langs is not None: if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs) tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None: if token_type_ids is not None:
tensor = tensor + self.embeddings(token_type_ids) tensor = tensor + self.embeddings(token_type_ids)
...@@ -764,7 +622,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -764,7 +622,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
self.transformer = XLMModel(config) self.transformer = XLMModel(config)
self.pred_layer = XLMPredLayer(config) self.pred_layer = XLMPredLayer(config)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -772,11 +630,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -772,11 +630,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
""" """
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings) self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
attention_mask=None, cache=None, labels=None, head_mask=None): lengths=None, cache=None, head_mask=None, labels=None):
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, transformer_outputs = self.transformer(input_ids,
token_type_ids=token_type_ids, langs=langs, attention_mask=attention_mask,
attention_mask=attention_mask, cache=cache, head_mask=head_mask) langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) outputs = self.pred_layer(output, labels)
...@@ -826,13 +689,18 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -826,13 +689,18 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.transformer = XLMModel(config) self.transformer = XLMModel(config)
self.sequence_summary = SequenceSummary(config) self.sequence_summary = SequenceSummary(config)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
attention_mask=None, cache=None, labels=None, head_mask=None): lengths=None, cache=None, head_mask=None, labels=None):
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, transformer_outputs = self.transformer(input_ids,
token_type_ids=token_type_ids, langs=langs, attention_mask=attention_mask,
attention_mask=attention_mask, cache=cache, head_mask=head_mask) langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
...@@ -904,14 +772,19 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -904,14 +772,19 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.transformer = XLMModel(config) self.transformer = XLMModel(config)
self.qa_outputs = SQuADHead(config) self.qa_outputs = SQuADHead(config)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None, def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
attention_mask=None, cache=None, start_positions=None, end_positions=None, lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
cls_index=None, is_impossible=None, p_mask=None, head_mask=None): is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids, transformer_outputs = self.transformer(input_ids,
token_type_ids=token_type_ids, langs=langs, attention_mask=attention_mask,
attention_mask=attention_mask, cache=cache, head_mask=head_mask) langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
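As with the other XLM heads, the refactor spells each transformer argument out by keyword. A hedged call sketch for the QA head (all tensors hypothetical):

outputs = model(input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,   # hypothetical supervision tensors
                end_positions=end_positions)
total_loss = outputs[0]                            # when both position tensors are given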
......
...@@ -29,9 +29,9 @@ from torch import nn ...@@ -29,9 +29,9 @@ from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel, from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits
SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, from .configuration_xlnet import XLNetConfig
add_start_docstrings) from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -40,10 +40,6 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { ...@@ -40,10 +40,6 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
} }
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
}
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
...@@ -192,165 +188,11 @@ def swish(x): ...@@ -192,165 +188,11 @@ def swish(x):
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class XLNetConfig(PretrainedConfig):
"""Configuration class to store the configuration of a ``XLNetModel``.
Args:
vocab_size_or_config_json_file: Vocabulary size of ``input_ids`` in ``XLNetModel``.
d_model: Size of the encoder layers and the pooler layer.
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
d_inner: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
ff_activation: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
untie_r: untie relative position biases
attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention
probabilities.
initializer_range: The stddev of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
dropout: float, dropout rate.
dropatt: float, dropout rate on attention probabilities.
init: str, the initialization scheme, either "normal" or "uniform".
init_range: float, initialize the parameters with a uniform distribution
in [-init_range, init_range]. Only effective when init="uniform".
init_std: float, initialize the parameters with a normal distribution
with mean 0 and stddev init_std. Only effective when init="normal".
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
bi_data: bool, whether to use bidirectional input pipeline.
Usually set to True during pretraining and False during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len.
-1 means no clamping.
same_length: bool, whether to use the same attention length for each token.
finetuning_task: name of the glue task on which the model was fine-tuned if any
"""
pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=32000,
d_model=1024,
n_layer=24,
n_head=16,
d_inner=4096,
ff_activation="gelu",
untie_r=True,
attn_type="bi",
initializer_range=0.02,
layer_norm_eps=1e-12,
dropout=0.1,
mem_len=None,
reuse_len=None,
bi_data=False,
clamp_len=-1,
same_length=False,
finetuning_task=None,
num_labels=2,
summary_type='last',
summary_use_proj=True,
summary_activation='tanh',
summary_last_dropout=0.1,
start_n_top=5,
end_n_top=5,
**kwargs):
"""Constructs XLNetConfig.
"""
super(XLNetConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.n_token = vocab_size_or_config_json_file
self.d_model = d_model
self.n_layer = n_layer
self.n_head = n_head
assert d_model % n_head == 0
self.d_head = d_model // n_head
self.ff_activation = ff_activation
self.d_inner = d_inner
self.untie_r = untie_r
self.attn_type = attn_type
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.dropout = dropout
self.mem_len = mem_len
self.reuse_len = reuse_len
self.bi_data = bi_data
self.clamp_len = clamp_len
self.same_length = same_length
self.finetuning_task = finetuning_task
self.num_labels = num_labels
self.summary_type = summary_type
self.summary_use_proj = summary_use_proj
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.start_n_top = start_n_top
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def max_position_embeddings(self):
return -1
@property
def vocab_size(self):
return self.n_token
@vocab_size.setter
def vocab_size(self, value):
self.n_token = value
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return self.n_layer
try: try:
from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
except (ImportError, AttributeError) as e: except (ImportError, AttributeError) as e:
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
class XLNetLayerNorm(nn.Module): from torch.nn import LayerNorm as XLNetLayerNorm
def __init__(self, d_model, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(XLNetLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(d_model))
self.bias = nn.Parameter(torch.zeros(d_model))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
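The deleted hand-rolled layer norm matches PyTorch's built-in up to naming (epsilon inside the square root, biased variance), which is what makes the ``nn.LayerNorm`` fallback safe; a quick numerical check with hypothetical shapes:

import torch
from torch import nn

x = torch.randn(2, 5, 16)
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
manual = (x - u) / torch.sqrt(s + 1e-12)           # weight=1, bias=0 at init
assert torch.allclose(nn.LayerNorm(16, eps=1e-12)(x), manual, atol=1e-5)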
class XLNetRelativeAttention(nn.Module): class XLNetRelativeAttention(nn.Module):
def __init__(self, config): def __init__(self, config):
...@@ -418,7 +260,10 @@ class XLNetRelativeAttention(nn.Module): ...@@ -418,7 +260,10 @@ class XLNetRelativeAttention(nn.Module):
attn_score = (ac + bd + ef) * self.scale attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None: if attn_mask is not None:
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
attn_score = attn_score - 1e30 * attn_mask if attn_mask.dtype == torch.float16:
attn_score = attn_score - 65500 * attn_mask
else:
attn_score = attn_score - 1e30 * attn_mask
# attention probability # attention probability
attn_prob = F.softmax(attn_score, dim=1) attn_prob = F.softmax(attn_score, dim=1)
...@@ -596,10 +441,7 @@ class XLNetPreTrainedModel(PreTrainedModel): ...@@ -596,10 +441,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
load_tf_weights = load_tf_weights_in_xlnet load_tf_weights = load_tf_weights_in_xlnet
base_model_prefix = "transformer" base_model_prefix = "transformer"
def __init__(self, *inputs, **kwargs): def _init_weights(self, module):
super(XLNetPreTrainedModel, self).__init__(*inputs, **kwargs)
def init_weights(self, module):
""" Initialize the weights. """ Initialize the weights.
""" """
if isinstance(module, (nn.Linear, nn.Embedding)): if isinstance(module, (nn.Linear, nn.Embedding)):
...@@ -662,19 +504,14 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -662,19 +504,14 @@ XLNET_INPUTS_DOCSTRING = r"""
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs). A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings. The type indices in XLNet are NOT selected in the vocabulary; they can be arbitrary numbers, and
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). the important thing is that they are different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
0 if the segment ids of two tokens are the same, 1 if not.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
Kept for compatibility with the original code base.
You can only use one of `input_mask` and `attention_mask`
Mask values selected in ``[0, 1]``:
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**mems**: (`optional`) **mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer): list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
...@@ -692,6 +529,17 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -692,6 +529,17 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask to indicate the output tokens to use. Mask to indicate the output tokens to use.
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token. If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
Only used during pretraining for partial prediction or for sequential decoding (generation). Only used during pretraining for partial prediction or for sequential decoding (generation).
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
Kept for compatibility with the original code base.
You can only use one of `input_mask` and `attention_mask`
Mask values selected in ``[0, 1]``:
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
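Since the relocated ``input_mask`` documented above is defined as the complement of ``attention_mask``, supplying one determines the other; a one-line conversion sketch (tensor hypothetical):

input_mask = 1.0 - attention_mask.float()   # 1 for padding, 0 for real tokens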
...@@ -746,7 +594,7 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -746,7 +594,7 @@ class XLNetModel(XLNetPreTrainedModel):
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
self.dropout = nn.Dropout(config.dropout) self.dropout = nn.Dropout(config.dropout)
self.apply(self.init_weights) self.init_weights()
def _resize_token_embeddings(self, new_num_tokens): def _resize_token_embeddings(self, new_num_tokens):
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens) self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
...@@ -850,8 +698,8 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -850,8 +698,8 @@ class XLNetModel(XLNetPreTrainedModel):
pos_emb = pos_emb.to(next(self.parameters())) pos_emb = pos_emb.to(next(self.parameters()))
return pos_emb return pos_emb
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
mems=None, perm_mask=None, target_mapping=None, head_mask=None): token_type_ids=None, input_mask=None, head_mask=None):
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
# so here we move the first dimension (batch) to the end # so here we move the first dimension (batch) to the end
...@@ -1047,7 +895,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1047,7 +895,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
self.transformer = XLNetModel(config) self.transformer = XLNetModel(config)
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
self.apply(self.init_weights) self.init_weights()
self.tie_weights() self.tie_weights()
def tie_weights(self): def tie_weights(self):
...@@ -1055,12 +903,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1055,12 +903,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
""" """
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding) self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, labels=None):
labels=None, head_mask=None): transformer_outputs = self.transformer(input_ids,
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
input_mask=input_mask, attention_mask=attention_mask, mems=mems,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask)
logits = self.lm_loss(transformer_outputs[0]) logits = self.lm_loss(transformer_outputs[0])
...@@ -1124,14 +975,17 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1124,14 +975,17 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.sequence_summary = SequenceSummary(config) self.sequence_summary = SequenceSummary(config)
self.logits_proj = nn.Linear(config.d_model, config.num_labels) self.logits_proj = nn.Linear(config.d_model, config.num_labels)
self.apply(self.init_weights) self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, labels=None):
labels=None, head_mask=None): transformer_outputs = self.transformer(input_ids,
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
input_mask=input_mask, attention_mask=attention_mask, mems=mems,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask)
output = transformer_outputs[0] output = transformer_outputs[0]
@@ -1317,15 +1171,18 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
        self.end_logits = PoolerEndLogits(config)
        self.answer_class = PoolerAnswerClass(config)

-        self.apply(self.init_weights)
+        self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
-                head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
                                                head_mask=head_mask)
        hidden_states = transformer_outputs[0]
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
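The same reordering applies to XLNetForSequenceClassification and XLNetForQuestionAnswering: the supervision arguments (labels, start_positions, and so on) now come last. A sketch of the reworked QA call under the same tiny, illustrative config, showing that passing the span targets by keyword yields the loss as the first output:

import torch
from pytorch_transformers import XLNetConfig, XLNetForQuestionAnswering

config = XLNetConfig(vocab_size_or_config_json_file=64, d_model=16,
                     n_layer=2, n_head=2, d_inner=32)
model = XLNetForQuestionAnswering(config)

input_ids = torch.randint(0, 64, (1, 8))
start_positions = torch.tensor([2])  # illustrative gold span: tokens 2..5
end_positions = torch.tensor([5])

outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
total_loss = outputs[0]              # combined start/end span loss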
...
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import os
import shutil
import json
import random
import uuid
import unittest
import logging


class ConfigTester(object):
    def __init__(self, parent, config_class=None, **kwargs):
        self.parent = parent
        self.config_class = config_class
        self.inputs_dict = kwargs

    def create_and_test_config_common_properties(self):
        config = self.config_class(**self.inputs_dict)
        self.parent.assertTrue(hasattr(config, 'vocab_size'))
        self.parent.assertTrue(hasattr(config, 'hidden_size'))
        self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
        self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))

    def create_and_test_config_to_json_string(self):
        config = self.config_class(**self.inputs_dict)
        obj = json.loads(config.to_json_string())
        for key, value in self.inputs_dict.items():
            self.parent.assertEqual(obj[key], value)

    def create_and_test_config_to_json_file(self):
        config_first = self.config_class(**self.inputs_dict)
        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
        config_first.to_json_file(json_file_path)
        config_second = self.config_class.from_json_file(json_file_path)
        os.remove(json_file_path)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()


if __name__ == "__main__":
    unittest.main()
\ No newline at end of file
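A model-specific suite can drive these shared checks by instantiating ConfigTester with its own config class. A minimal sketch, assuming the test module sits in the same package as configuration_common_test and using an arbitrary hidden_size override:

import unittest

from pytorch_transformers import BertConfig

from .configuration_common_test import ConfigTester


class BertConfigTest(unittest.TestCase):
    def test_config(self):
        # run_common_tests() asserts the common attributes exist and round-trips
        # the overridden value through to_json_string() and to_json_file().
        ConfigTester(self, config_class=BertConfig, hidden_size=37).run_common_tests()


if __name__ == "__main__":
    unittest.main()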
@@ -21,10 +21,15 @@ import shutil
import pytest
import logging

-from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
+from pytorch_transformers import (AutoConfig, BertConfig,
+                                  AutoModel, BertModel,
+                                  AutoModelWithLMHead, BertForMaskedLM,
+                                  AutoModelForSequenceClassification, BertForSequenceClassification,
+                                  AutoModelForQuestionAnswering, BertForQuestionAnswering)
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP

-from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester


class AutoModelTest(unittest.TestCase):
@@ -42,6 +47,42 @@ class AutoModelTest(unittest.TestCase):
            for value in loading_info.values():
                self.assertEqual(len(value), 0)
    def test_lmhead_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelWithLMHead.from_pretrained(model_name)
            model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForMaskedLM)

    def test_sequence_classification_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForSequenceClassification)

    def test_question_answering_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForQuestionAnswering)


if __name__ == "__main__":
    unittest.main()
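For reference, the resolution these new tests assert can also be exercised directly. A short sketch, assuming the standard 'bert-base-uncased' shortcut name is reachable for download:

from pytorch_transformers import (AutoModelWithLMHead,
                                  AutoModelForSequenceClassification,
                                  AutoModelForQuestionAnswering)

# Each Auto* factory inspects the model name and returns the matching BERT head class.
lm_model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')                   # BertForMaskedLM
cls_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')   # BertForSequenceClassification
qa_model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')         # BertForQuestionAnswering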