Unverified commit 54abc67a authored by Thomas Wolf, committed by GitHub

Merge pull request #2255 from aaugustin/implement-best-practices

Implement some Python best practices
parents 645713e2 c11b3e29
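For context, the conventions applied throughout the diff below look like black-style formatting plus isort-style import sorting (my reading of the changes; the tool configuration itself is not shown in this excerpt): double-quoted strings, parenthesized multi-line wrapping instead of backslash continuations, and alphabetized, one-name-per-line imports. A minimal sketch of the before/after style:

# Old style, as seen on the removed lines: single quotes, compact literals.
options = {'cache_dir': None, 'proxies': None}

# New style, as seen on the added lines: double quotes, parenthesized wrapping.
options = {
    "cache_dir": None,
    "proxies": None,
}
print(options)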
@@ -14,8 +14,7 @@
# limitations under the License.
""" Configuration base class and utilities."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
@@ -24,9 +23,15 @@ import os
from io import open
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \
cached_path, is_remote_url, hf_bucket_url
from .file_utils import (
CONFIG_NAME,
MODEL_CARD_NAME,
TF2_WEIGHTS_NAME,
WEIGHTS_NAME,
cached_path,
hf_bucket_url,
is_remote_url,
)
logger = logging.getLogger(__name__)
@@ -48,17 +53,18 @@ class ModelCard(object):
Parameters:
"""
def __init__(self, **kwargs):
# Recommended attributes from https://arxiv.org/abs/1810.03993 (see paper)
self.model_details = kwargs.pop('model_details', {})
self.intended_use = kwargs.pop('intended_use', {})
self.factors = kwargs.pop('factors', {})
self.metrics = kwargs.pop('metrics', {})
self.evaluation_data = kwargs.pop('evaluation_data', {})
self.training_data = kwargs.pop('training_data', {})
self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
self.ethical_considerations = kwargs.pop('ethical_considerations', {})
self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
self.model_details = kwargs.pop("model_details", {})
self.intended_use = kwargs.pop("intended_use", {})
self.factors = kwargs.pop("factors", {})
self.metrics = kwargs.pop("metrics", {})
self.evaluation_data = kwargs.pop("evaluation_data", {})
self.training_data = kwargs.pop("training_data", {})
self.quantitative_analyses = kwargs.pop("quantitative_analyses", {})
self.ethical_considerations = kwargs.pop("ethical_considerations", {})
self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {})
# Open additional attributes
for key, value in kwargs.items():
@@ -122,10 +128,10 @@ class ModelCard(object):
modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
"""
cache_dir = kwargs.pop('cache_dir', None)
proxies = kwargs.pop('proxies', None)
find_from_standard_name = kwargs.pop('find_from_standard_name', True)
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
cache_dir = kwargs.pop("cache_dir", None)
proxies = kwargs.pop("proxies", None)
find_from_standard_name = kwargs.pop("find_from_standard_name", True)
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
# For simplicity we use the same pretrained URL as the configuration files
@@ -145,36 +151,43 @@ class ModelCard(object):
try:
# Load from URL or cache if already cached
resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True,
proxies=proxies, resume_download=False)
resolved_model_card_file = cached_path(
model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False
)
if resolved_model_card_file == model_card_file:
logger.info("loading model card file {}".format(model_card_file))
else:
logger.info("loading model card file {} from cache at {}".format(
model_card_file, resolved_model_card_file))
logger.info(
"loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file)
)
# Load model card
modelcard = cls.from_json_file(resolved_model_card_file)
except EnvironmentError:
if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
logger.warning("Couldn't reach server at '{}' to download model card file.".format(
model_card_file))
logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file))
else:
logger.warning("Model name '{}' was not found in model name list ({}). " \
"We assumed '{}' was a path or url to a model card file named {} or " \
logger.warning(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url to a model card file named {} or "
"a directory containing such a file but couldn't find any such file at this path or url.".format(
pretrained_model_name_or_path,
', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
model_card_file, MODEL_CARD_NAME))
", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
model_card_file,
MODEL_CARD_NAME,
)
)
logger.warning("Creating an empty model card.")
# We fall back on creating an empty model card
modelcard = cls()
except json.JSONDecodeError:
logger.warning("Couldn't reach server at '{}' to download model card file or "
logger.warning(
"Couldn't reach server at '{}' to download model card file or "
"model card file is not a valid JSON file. "
"Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
"Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)
)
logger.warning("Creating an empty model card.")
# We fall back on creating an empty model card
@@ -203,7 +216,7 @@ class ModelCard(object):
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `ModelCard` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
dict_obj = json.loads(text)
return cls(**dict_obj)
@@ -225,5 +238,5 @@ class ModelCard(object):
def to_json_file(self, json_file_path):
""" Save this instance to a json file."""
with open(json_file_path, "w", encoding='utf-8') as writer:
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string())
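A round-trip sketch using the two JSON helpers above (values and the temporary path are hypothetical, and the `from transformers import ModelCard` path is an assumption for this version of the library):

import os
import tempfile

from transformers import ModelCard  # import path assumed

card = ModelCard(metrics={"accuracy": 0.9})  # hypothetical content
path = os.path.join(tempfile.mkdtemp(), "modelcard.json")
card.to_json_file(path)

restored = ModelCard.from_json_file(path)
print(restored.metrics)  # {'accuracy': 0.9}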
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
@@ -15,29 +14,33 @@
# limitations under the License.
"""PyTorch ALBERT model. """
import os
import math
import logging
import math
import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_albert import AlbertConfig
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
from transformers.modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from transformers.modeling_utils import PreTrainedModel
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
"albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
"albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
"albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
"albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
"albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
"albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
"albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
"albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
}
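The map above is how from_pretrained resolves a shortcut name to a weights URL; identifiers not in the map are treated further down as local paths or remote URLs. A lookup sketch (no download performed):

url = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP["albert-base-v2"]
print(url)  # .../albert-base-v2-pytorch_model.bin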
@@ -48,8 +51,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -109,7 +114,7 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
if "seq_relationship" in name:
continue
name = name.split('/')
name = name.split("/")
# Ignore the gradients applied by the LAMB/ADAM optimizers.
if "adam_m" in name or "adam_v" in name or "global_step" in name:
@@ -118,32 +123,32 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
elif l[0] == 'squad':
pointer = getattr(pointer, 'classifier')
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
assert pointer.shape == array.shape
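The loop above walks each slash-separated TensorFlow variable name down the PyTorch module tree: numeric suffixes such as layer_0 become an attribute lookup plus a ModuleList index, and TF's kernel/gamma/beta names are mapped to PyTorch's weight/bias (kernels are additionally transposed before copying). A standalone sketch of the traversal on a toy module (the variable name is hypothetical):

import re

import torch.nn as nn

# Toy module tree mirroring the attribute/index traversal in the loader above.
model = nn.Module()
model.encoder = nn.Module()
model.encoder.layer = nn.ModuleList([nn.Linear(4, 4)])

pointer = model
for m_name in "encoder/layer_0/kernel".split("/"):  # hypothetical TF variable name
    if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
        scope_names = re.split(r"_(\d+)", m_name)  # "layer_0" -> ["layer", "0", ""]
    else:
        scope_names = [m_name]
    if scope_names[0] == "kernel":  # TF "kernel" becomes PyTorch "weight"
        pointer = getattr(pointer, "weight")
    else:
        pointer = getattr(pointer, scope_names[0])
    if len(scope_names) >= 2:
        pointer = pointer[int(scope_names[1])]  # numeric suffix indexes a ModuleList
print(pointer.shape)  # torch.Size([4, 4])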
@@ -160,6 +165,7 @@ class AlbertEmbeddings(BertEmbeddings):
"""
Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(AlbertEmbeddings, self).__init__(config)
@@ -238,9 +244,12 @@ class AlbertAttention(BertSelfAttention):
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
reshaped_context_layer = context_layer.view(*new_context_layer_shape)
# Should find a better way to do this
w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
w = (
self.dense.weight.t()
.view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
.to(context_layer.dtype)
)
b = self.dense.bias.to(context_layer.dtype)
projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
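The einsum above applies the output projection per head and sums over heads in one step; a shape-level sketch of its equivalence with an explicit reshape-and-matmul (all dimensions hypothetical):

import torch

b, f, n, d, h = 2, 5, 4, 8, 32  # batch, seq, heads, head_size, hidden; n * d == h
context_layer = torch.randn(b, f, n, d)
w = torch.randn(n, d, h)  # the dense weight, reshaped per head as in the code above
bias = torch.randn(h)

out_einsum = torch.einsum("bfnd,ndh->bfh", context_layer, w) + bias
out_matmul = context_layer.reshape(b, f, n * d) @ w.reshape(n * d, h) + bias
print(torch.allclose(out_einsum, out_matmul, atol=1e-5))  # True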
@@ -328,7 +337,11 @@ class AlbertTransformer(nn.Module):
# Index of the layer inside the group
layer_idx = int(i - group_idx * layers_per_group)
layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group])
layer_group_output = self.albert_layer_groups[group_idx](
hidden_states,
attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
)
hidden_states = layer_group_output[0]
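A worked sketch of the group arithmetic above (config values hypothetical; group_idx = i // layers_per_group is my assumption, consistent with the head_mask slice bounds shown):

num_hidden_layers, num_hidden_groups = 12, 4  # hypothetical config values
layers_per_group = num_hidden_layers // num_hidden_groups  # 3
for i in range(num_hidden_layers):
    group_idx = i // layers_per_group  # assumed, per the slice bounds above
    layer_idx = int(i - group_idx * layers_per_group)
    mask_slice = (group_idx * layers_per_group, (group_idx + 1) * layers_per_group)
    print(i, group_idx, layer_idx, mask_slice)
# Layers 0-2 reuse group 0 with head_mask[0:3], layers 3-5 group 1 with head_mask[3:6], etc.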
if self.output_attentions:
@@ -337,7 +350,6 @@ class AlbertTransformer(nn.Module):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
@@ -346,11 +358,11 @@ class AlbertTransformer(nn.Module):
return outputs # last-layer hidden state, (all hidden states), (all attentions)
class AlbertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "albert"
@@ -431,8 +443,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertModel(AlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -500,8 +516,15 @@ class AlbertModel(AlbertPreTrainedModel):
inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
inputs_embeds=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -527,24 +550,30 @@ class AlbertModel(AlbertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
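A shape-only sketch of the head-mask broadcasting above (dimensions hypothetical): a 1D mask is shared across layers, a 2D mask is per layer, and both end up as [num_layers, 1, num_heads, 1, 1]:

import torch

num_hidden_layers, num_heads = 12, 4  # hypothetical
mask_1d = torch.ones(num_heads)  # one mask shared by every layer
m = mask_1d.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
m = m.expand(num_hidden_layers, -1, -1, -1, -1)
print(m.shape)  # torch.Size([12, 1, 4, 1, 1])

mask_2d = torch.ones(num_hidden_layers, num_heads)  # a mask per layer
m2 = mask_2d.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
print(m2.shape)  # torch.Size([12, 1, 4, 1, 1])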
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
encoder_outputs = self.encoder(embedding_output,
extended_attention_mask,
head_mask=head_mask)
embedding_output = self.embeddings(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
outputs = (sequence_output, pooled_output) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs
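A usage sketch of the outputs returned above (downloads pretrained weights, so it needs network access; import paths assumed for this version of the library):

import torch

from transformers import AlbertModel, AlbertTokenizer  # import paths assumed

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertModel.from_pretrained("albert-base-v2")
input_ids = torch.tensor([tokenizer.encode("Hello, world")])
sequence_output, pooled_output = model(input_ids)[:2]
print(sequence_output.shape, pooled_output.shape)  # (1, seq_len, 768) and (1, 768)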
class AlbertMLMHead(nn.Module):
def __init__(self, config):
super(AlbertMLMHead, self).__init__()
@@ -566,7 +595,9 @@ class AlbertMLMHead(nn.Module):
return prediction_scores
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
)
class AlbertForMaskedLM(AlbertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -602,21 +633,28 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
self._tie_or_clone_weights(self.predictions.decoder,
self.albert.embeddings.word_embeddings)
self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
def get_output_embeddings(self):
return self.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
):
outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
sequence_outputs = outputs[0]
@@ -631,9 +669,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return outputs
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -665,6 +706,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(AlbertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
@@ -675,8 +717,16 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.albert(
input_ids=input_ids,
@@ -684,7 +734,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
@@ -707,10 +757,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -752,6 +804,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
"""
def __init__(self, config):
super(AlbertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
@@ -761,8 +814,17 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
inputs_embeds=None, start_positions=None, end_positions=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
outputs = self.albert(
input_ids=input_ids,
@@ -770,7 +832,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
@@ -18,40 +18,91 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_utils import PreTrainedModel, SequenceSummary
from .file_utils import add_start_docstrings
from .configuration_auto import (
AlbertConfig,
BertConfig,
CamembertConfig,
CTRLConfig,
DistilBertConfig,
GPT2Config,
OpenAIGPTConfig,
RobertaConfig,
TransfoXLConfig,
XLMConfig,
XLMRobertaConfig,
XLNetConfig,
)
from .modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForMaskedLM,
AlbertForQuestionAnswering,
AlbertForSequenceClassification,
AlbertModel,
)
from .modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForMaskedLM,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertModel,
)
from .modeling_camembert import (
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
CamembertForMaskedLM,
CamembertForSequenceClassification,
CamembertForTokenClassification,
CamembertModel,
)
from .modeling_ctrl import CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRLModel
from .modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForMaskedLM,
DistilBertForQuestionAnswering,
DistilBertForSequenceClassification,
DistilBertForTokenClassification,
DistilBertModel,
)
from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
from .modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
RobertaForMaskedLM,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
)
from .modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5Model, T5WithLMHeadModel
from .modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TransfoXLLMHeadModel, TransfoXLModel
from .modeling_xlm import (
XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
XLMForQuestionAnswering,
XLMForSequenceClassification,
XLMModel,
XLMWithLMHeadModel,
)
from .modeling_xlm_roberta import (
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
XLMRobertaForMaskedLM,
XLMRobertaForSequenceClassification,
XLMRobertaForTokenClassification,
XLMRobertaModel,
)
from .modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
XLNetForSequenceClassification,
XLNetForTokenClassification,
XLNetLMHeadModel,
XLNetModel,
)
logger = logging.getLogger(__name__)
ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
(key, value)
for pretrained_map in [
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -67,7 +118,8 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
T5_PRETRAINED_MODEL_ARCHIVE_MAP,
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
]
for key, value in pretrained_map.items())
for key, value in pretrained_map.items()
)
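The merged map above uses dict() over a generator; a dict comprehension is the equivalent and arguably more idiomatic modern form (toy maps for illustration):

maps = [{"bert-base-uncased": "url-1"}, {"gpt2": "url-2"}]  # toy stand-ins
merged = {key: value for pretrained_map in maps for key, value in pretrained_map.items()}
print(merged)  # {'bert-base-uncased': 'url-1', 'gpt2': 'url-2'}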
class AutoModel(object):
@@ -98,10 +150,13 @@ class AutoModel(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModel is designed to be instantiated "
raise EnvironmentError(
"AutoModel is designed to be instantiated "
"using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModel.from_config(config)` methods.")
"`AutoModel.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -232,35 +287,39 @@ class AutoModel(object):
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
elif "camembert" in pretrained_model_name_or_path:
return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
"'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
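Note that the dispatch above is first-match-wins on substrings, so order matters: 'xlm-roberta' must be tested before 'roberta' and 'xlm', and 'distilbert'/'albert'/'camembert'/'roberta' before 'bert', since all of those names contain 'bert'. A reduced sketch of the same pattern:

def resolve(name):
    # First-match-wins substring dispatch, reduced to a toy table. Order matters
    # because names nest: "xlm-roberta" contains "roberta" and "xlm", and
    # "roberta" itself contains "bert".
    table = [
        ("xlm-roberta", "XLMRobertaModel"),
        ("roberta", "RobertaModel"),
        ("bert", "BertModel"),
        ("xlm", "XLMModel"),
    ]
    for key, cls in table:
        if key in name:
            return cls
    raise ValueError("Unrecognized model identifier in {}".format(name))

print(resolve("xlm-roberta-base"))  # XLMRobertaModel, not RobertaModel or XLMModel
print(resolve("roberta-large"))     # RobertaModel, not BertModel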
class AutoModelWithLMHead(object):
@@ -291,10 +350,13 @@ class AutoModelWithLMHead(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
raise EnvironmentError(
"AutoModelWithLMHead is designed to be instantiated "
"using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelWithLMHead.from_config(config)` methods.")
"`AutoModelWithLMHead.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -423,35 +485,39 @@ class AutoModelWithLMHead(object):
model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
elif "camembert" in pretrained_model_name_or_path:
return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
"'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
class AutoModelForSequenceClassification(object):
@@ -477,10 +543,13 @@ class AutoModelForSequenceClassification(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
raise EnvironmentError(
"AutoModelForSequenceClassification is designed to be instantiated "
"using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForSequenceClassification.from_config(config)` methods.")
"`AutoModelForSequenceClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -597,25 +666,39 @@ class AutoModelForSequenceClassification(object):
model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return DistilBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "albert" in pretrained_model_name_or_path:
return AlbertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "camembert" in pretrained_model_name_or_path:
return CamembertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return RobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(
pretrained_model_name_or_path
)
)
class AutoModelForQuestionAnswering(object):
@@ -638,10 +721,13 @@ class AutoModelForQuestionAnswering(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
raise EnvironmentError(
"AutoModelForQuestionAnswering is designed to be instantiated "
"using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForQuestionAnswering.from_config(config)` methods.")
"`AutoModelForQuestionAnswering.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -745,26 +831,30 @@ class AutoModelForQuestionAnswering(object):
model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)
)
class AutoModelForTokenClassification:
def __init__(self):
raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated "
raise EnvironmentError(
"AutoModelForTokenClassification is designed to be instantiated "
"using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForTokenClassification.from_config(config)` methods.")
"`AutoModelForTokenClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
@@ -870,18 +960,28 @@ class AutoModelForTokenClassification:
model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'camembert' in pretrained_model_name_or_path:
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
if "camembert" in pretrained_model_name_or_path:
return CamembertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(
pretrained_model_name_or_path
)
)
@@ -26,34 +26,35 @@ import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_bert import BertConfig
from .file_utils import add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
logger = logging.getLogger(__name__)
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
}
@@ -65,8 +66,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
@@ -81,7 +84,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
arrays.append(array)
for name, array in zip(names, arrays):
name = name.split('/')
name = name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
@@ -89,30 +92,30 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
elif l[0] == 'squad':
pointer = getattr(pointer, 'classifier')
scope_names = [m_name]
if scope_names[0] == "kernel" or scope_names[0] == "gamma":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "output_weights":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "squad":
pointer = getattr(pointer, "classifier")
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
if m_name[-11:] == "_embeddings":
pointer = getattr(pointer, "weight")
elif m_name == "kernel":
array = np.transpose(array)
try:
assert pointer.shape == array.shape
@@ -157,6 +160,7 @@ BertLayerNorm = torch.nn.LayerNorm
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
@@ -199,7 +203,8 @@ class BertSelfAttention(nn.Module):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
@@ -217,7 +222,14 @@ class BertSelfAttention(nn.Module):
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
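transpose_for_scores above splits the hidden dimension into heads and moves the head axis in front of the sequence axis; a shape sketch (dimensions hypothetical):

import torch

batch, seq, num_heads, head_size = 2, 7, 12, 64
x = torch.randn(batch, seq, num_heads * head_size)  # [B, T, hidden]
x = x.view(batch, seq, num_heads, head_size)        # [B, T, heads, head_size]
x = x.permute(0, 2, 1, 3)                           # [B, heads, T, head_size]
print(x.shape)  # torch.Size([2, 12, 7, 64])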
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
@@ -307,8 +319,17 @@ class BertAttention(nn.Module):
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
self_outputs = self.self(
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
@@ -318,7 +339,9 @@ class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
@@ -353,13 +376,22 @@ class BertLayer(nn.Module):
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
if self.is_decoder and encoder_hidden_states is not None:
cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
cross_attention_outputs = self.crossattention(
attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
@@ -376,14 +408,23 @@ class BertEncoder(nn.Module):
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
)
hidden_states = layer_outputs[0]
if self.output_attentions:
@@ -420,7 +461,9 @@ class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
@@ -440,9 +483,7 @@ class BertLMPredictionHead(nn.Module):
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size,
config.vocab_size,
bias=False)
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
@@ -488,6 +529,7 @@ class BertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = BertConfig
pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_bert
@@ -581,8 +623,12 @@ BERT_INPUTS_DOCSTRING = r"""
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -612,6 +658,7 @@ class BertModel(BertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(BertModel, self).__init__(config)
self.config = config
......@@ -636,8 +683,17 @@ class BertModel(BertPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
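For illustration, a toy sketch of what a prune_heads call boils down to (the sizes and the heads_to_prune mapping are invented): rows of a projection belonging to the pruned heads are dropped with an index select.
import torch
import torch.nn as nn
n_heads, head_dim = 4, 8  # hypothetical head layout
lin = nn.Linear(n_heads * head_dim, n_heads * head_dim)
heads_to_prune = {0: [1, 3]}  # layer 0: drop heads 1 and 3
mask = torch.ones(n_heads, head_dim)
for head in heads_to_prune[0]:
    mask[head] = 0
index = torch.arange(n_heads * head_dim)[mask.view(-1).bool()]
pruned_weight = lin.weight.index_select(0, index)  # keep rows of surviving heads
print(pruned_weight.shape)  # torch.Size([16, 32])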
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
""" Forward pass on the Model.
The model can behave as an encoder (with only self-attention) as well
......@@ -681,12 +737,18 @@ class BertModel(BertPreTrainedModel):
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3
causal_mask = causal_mask.to(
torch.long
) # not converting to long will cause errors with pytorch version < 1.3
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
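A standalone sketch of the broadcast trick used above (batch size and sequence length are made up): query position q may attend to key position k only when k <= q.
import torch
batch_size, seq_length = 2, 4  # toy sizes
seq_ids = torch.arange(seq_length)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
print(causal_mask[0].long())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]])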
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
......@@ -709,10 +771,15 @@ class BertModel(BertPreTrainedModel):
elif encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
encoder_attention_mask.shape))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
raise ValueError(
"Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(
encoder_hidden_shape, encoder_attention_mask.shape
)
)
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
else:
encoder_extended_attention_mask = None
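A quick sketch of the additive conversion applied above (the mask values are made up): attended positions map to 0.0 and masked ones to -10000.0, which is effectively minus infinity once added to the pre-softmax scores.
import torch
attention_mask = torch.tensor([[1.0, 1.0, 0.0]])  # last token is padding
extended = attention_mask[:, None, None, :]  # (batch, 1, 1, seq)
extended = (1.0 - extended) * -10000.0
print(extended)  # tensor([[[[-0., -0., -10000.]]]])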
......@@ -727,28 +794,40 @@ class BertModel(BertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
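A toy sketch of the two accepted head_mask shapes handled above (layer and head counts are invented): a 1D mask is shared by every layer, a 2D mask supplies one row per layer, and both broadcast to five dimensions.
import torch
n_layer, n_heads = 3, 4  # hypothetical sizes
head_mask = torch.ones(n_heads)  # 1D: one mask for all layers
expanded = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
expanded = expanded.expand(n_layer, -1, -1, -1, -1)
print(expanded.shape)  # torch.Size([3, 1, 4, 1, 1])
per_layer = torch.ones(n_layer, n_heads)  # 2D: one row per layer
print(per_layer.unsqueeze(1).unsqueeze(-1).unsqueeze(-1).shape)  # torch.Size([3, 1, 4, 1, 1])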
embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds)
encoder_outputs = self.encoder(embedding_output,
embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask)
encoder_attention_mask=encoder_extended_attention_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
@add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training:
a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -786,6 +865,7 @@ class BertForPreTraining(BertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
......@@ -797,20 +877,33 @@ class BertForPreTraining(BertPreTrainedModel):
def get_output_embeddings(self):
return self.cls.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None, next_sentence_label=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
next_sentence_label=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss()
......@@ -822,9 +915,9 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
)
class BertForMaskedLM(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -862,6 +955,7 @@ class BertForMaskedLM(BertPreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
......@@ -873,17 +967,30 @@ class BertForMaskedLM(BertPreTrainedModel):
def get_output_embeddings(self):
return self.cls.predictions.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
lm_labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask)
encoder_attention_mask=encoder_attention_mask,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
......@@ -912,9 +1019,11 @@ class BertForMaskedLM(BertPreTrainedModel):
return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
r"""
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -945,6 +1054,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
seq_relationship_scores = outputs[0]
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
......@@ -953,15 +1063,25 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
next_sentence_label=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
next_sentence_label=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -976,10 +1096,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1011,6 +1133,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1021,15 +1144,25 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -1051,10 +1184,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1087,6 +1222,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
loss, classification_scores = outputs[:2]
"""
def __init__(self, config):
super(BertForMultipleChoice, self).__init__(config)
......@@ -1096,8 +1232,16 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
num_choices = input_ids.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1))
......@@ -1105,12 +1249,14 @@ class BertForMultipleChoice(BertPreTrainedModel):
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
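A toy sketch of the flatten-then-unflatten pattern used here (all sizes invented): choices fold into the batch dimension for the encoder, and the per-choice scores are reshaped back before the softmax over choices.
import torch
batch_size, num_choices, seq_len = 2, 4, 7  # hypothetical sizes
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 7)
logits = torch.randn(flat_input_ids.size(0), 1)  # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)  # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)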
outputs = self.bert(input_ids,
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......@@ -1128,10 +1274,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -1161,6 +1309,7 @@ class BertForTokenClassification(BertPreTrainedModel):
loss, scores = outputs[:2]
"""
def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1171,15 +1320,25 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......@@ -1202,10 +1361,12 @@ class BertForTokenClassification(BertPreTrainedModel):
return outputs # (loss), scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING)
BERT_INPUTS_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -1247,6 +1408,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
......@@ -1256,15 +1418,26 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None):
outputs = self.bert(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......
......@@ -15,19 +15,25 @@
# limitations under the License.
"""PyTorch CamemBERT model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
from .configuration_camembert import CamembertConfig
from .file_utils import add_start_docstrings
from .modeling_roberta import (
RobertaForMaskedLM,
RobertaForMultipleChoice,
RobertaForSequenceClassification,
RobertaForTokenClassification,
RobertaModel,
)
logger = logging.getLogger(__name__)
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
"camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin",
}
......@@ -100,8 +106,12 @@ CAMEMBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertModel(RobertaModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -149,8 +159,11 @@ class CamembertModel(RobertaModel):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForMaskedLM(RobertaForMaskedLM):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -185,9 +198,12 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
@add_start_docstrings(
"""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForSequenceClassification(RobertaForSequenceClassification):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -223,9 +239,12 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForMultipleChoice(RobertaForMultipleChoice):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -257,9 +276,12 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""CamemBERT Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
CAMEMBERT_START_DOCSTRING,
CAMEMBERT_INPUTS_DOCSTRING,
)
class CamembertForTokenClassification(RobertaForTokenClassification):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......
......@@ -17,22 +17,17 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel
logger = logging.getLogger(__name__)
......@@ -40,14 +35,17 @@ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size)
angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
return pos * angle_rates
def positional_encoding(position, d_model_size, dtype):
# create the sinusoidal pattern for the positional encoding
angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1),
angle_rads = angle_defn(
torch.arange(position, dtype=dtype).unsqueeze(1),
torch.arange(d_model_size, dtype=dtype).unsqueeze(0),
d_model_size))
d_model_size,
)
sines = torch.sin(angle_rads[:, 0::2])
cosines = torch.cos(angle_rads[:, 1::2])
......@@ -55,16 +53,17 @@ def positional_encoding(position, d_model_size, dtype):
pos_encoding = torch.cat([sines, cosines], dim=-1)
return pos_encoding
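A self-contained rerun of the table construction above with toy sizes (position count and model width are made up): even columns carry sines, odd columns cosines, concatenated along the feature axis.
import torch
def angle_defn(pos, i, d_model_size):
    return pos * (1 / torch.pow(10000, (2 * (i // 2)) / d_model_size))
position, d_model_size = 6, 8  # toy sizes
angle_rads = angle_defn(
    torch.arange(position, dtype=torch.float).unsqueeze(1),
    torch.arange(d_model_size, dtype=torch.float).unsqueeze(0),
    d_model_size,
)
pos_encoding = torch.cat([torch.sin(angle_rads[:, 0::2]), torch.cos(angle_rads[:, 1::2])], dim=-1)
print(pos_encoding.shape)  # torch.Size([6, 8])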
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
# calculate attention
matmul_qk = torch.matmul(q, k.permute(0,1,3,2))
matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
dk = k.shape[-1]
scaled_attention_logits = matmul_qk / np.sqrt(dk)
if mask is not None:
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
if attention_mask is not None:
# Apply the attention mask
......@@ -128,11 +127,8 @@ class MultiHeadAttention(torch.nn.Module):
return outputs
def point_wise_feed_forward_network(d_model_size, dff):
return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff),
torch.nn.ReLU(),
torch.nn.Linear(dff, d_model_size))
return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size))
class EncoderLayer(torch.nn.Module):
......@@ -150,10 +146,9 @@ class EncoderLayer(torch.nn.Module):
def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None):
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention(normed, normed, normed, mask,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask)
attn_outputs = self.multi_head_attention(
normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output)
out1 = x + attn_output
......@@ -171,6 +166,7 @@ class CTRLPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = CTRLConfig
pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -244,8 +240,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class CTRLModel(CTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -273,6 +273,7 @@ class CTRLModel(CTRLPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(CTRLModel, self).__init__(config)
self.output_hidden_states = config.output_hidden_states
......@@ -287,11 +288,12 @@ class CTRLModel(CTRLPreTrainedModel):
self.w = nn.Embedding(config.vocab_size, config.n_embd)
self.dropout = nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList([EncoderLayer(config.n_embd,
config.n_head,
config.dff,
config.resid_pdrop,
config.output_attentions) for _ in range(config.n_layer)])
self.h = nn.ModuleList(
[
EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions)
for _ in range(config.n_layer)
]
)
self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
self.init_weights()
......@@ -309,7 +311,16 @@ class CTRLModel(CTRLPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -357,8 +368,12 @@ class CTRLModel(CTRLPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -391,11 +406,9 @@ class CTRLModel(CTRLPreTrainedModel):
for i, (h, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = h(hidden_states,
mask,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i])
outputs = h(
hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
)
hidden_states, present = outputs[:2]
if self.output_past:
presents = presents + (present,)
......@@ -421,8 +434,12 @@ class CTRLModel(CTRLPreTrainedModel):
return outputs
@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class CTRLLMHeadModel(CTRLPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -463,6 +480,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(CTRLLMHeadModel, self).__init__(config)
self.transformer = CTRLModel(config)
......@@ -473,15 +491,26 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
......@@ -495,8 +524,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
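A toy sketch of the one-token shift applied above (tensor sizes are made up): the logits at position t are scored against the label at position t + 1, so the last logits row and the first label column are dropped.
import torch
from torch.nn import CrossEntropyLoss
batch, seq, vocab = 2, 5, 11  # hypothetical sizes
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(vocab, (batch, seq))
shift_logits = lm_logits[..., :-1, :].contiguous()  # drop last position
shift_labels = labels[..., 1:].contiguous()  # drop first label
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.shape)  # scalar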
......@@ -18,60 +18,53 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import copy
import logging
import math
import copy
import sys
from io import open
import itertools
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
import logging
logger = logging.getLogger(__name__)
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
"distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
"distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
"distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
}
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x):
return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
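For reference, a quick numeric check of the erf-based gelu above (the sample points are arbitrary):
import math
import torch
def gelu(x):
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
x = torch.tensor([-1.0, 0.0, 1.0])
print(gelu(x))  # tensor([-0.1587, 0.0000, 0.8413])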
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
for pos in range(n_pos)
])
position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
class Embeddings(nn.Module):
def __init__(self,
config):
def __init__(self, config):
super(Embeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
if config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(n_pos=config.max_position_embeddings,
dim=config.dim,
out=self.position_embeddings.weight)
create_sinusoidal_embeddings(
n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight
)
self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
self.dropout = nn.Dropout(config.dropout)
......@@ -100,6 +93,7 @@ class Embeddings(nn.Module):
embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim)
return embeddings
class MultiHeadSelfAttention(nn.Module):
def __init__(self, config):
super(MultiHeadSelfAttention, self).__init__()
......@@ -139,7 +133,7 @@ class MultiHeadSelfAttention(nn.Module):
self.dim = attention_head_size * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, query, key, value, mask, head_mask = None):
def forward(self, query, key, value, mask, head_mask=None):
"""
Parameters
----------
......@@ -177,9 +171,9 @@ class MultiHeadSelfAttention(nn.Module):
v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length)
mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length)
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length)
mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length)
weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length)
weights = self.dropout(weights) # (bs, n_heads, q_length, k_length)
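A toy sketch of the masking step above (shapes are made up): key positions where the mask is 0 are filled with -inf, so they get exactly zero weight after the softmax.
import torch
import torch.nn as nn
scores = torch.randn(1, 1, 2, 3)  # (bs, n_heads, q_length, k_length)
mask = torch.tensor([[1, 1, 0]])  # last key position is padding
pad = (mask == 0).view(1, 1, 1, 3).expand_as(scores)
scores = scores.masked_fill(pad, -float("inf"))
weights = nn.Softmax(dim=-1)(scores)
print(weights[0, 0])  # last column is 0.0 in both rows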
......@@ -197,14 +191,17 @@ class MultiHeadSelfAttention(nn.Module):
else:
return (context,)
class FFN(nn.Module):
def __init__(self, config):
super(FFN, self).__init__()
self.dropout = nn.Dropout(p=config.dropout)
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
self.activation = gelu if config.activation == 'gelu' else nn.ReLU()
assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
config.activation
)
self.activation = gelu if config.activation == "gelu" else nn.ReLU()
def forward(self, input):
x = self.lin1(input)
......@@ -213,6 +210,7 @@ class FFN(nn.Module):
x = self.dropout(x)
return x
class TransformerBlock(nn.Module):
def __init__(self, config):
super(TransformerBlock, self).__init__()
......@@ -303,9 +301,7 @@ class Transformer(nn.Module):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module(x=hidden_state,
attn_mask=attn_mask,
head_mask=head_mask[i])
layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i])
hidden_state = layer_outputs[-1]
if self.output_attentions:
......@@ -327,11 +323,12 @@ class Transformer(nn.Module):
return outputs # last-layer hidden state, (all hidden states), (all attentions)
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class DistilBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = None
......@@ -396,8 +393,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertModel(DistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -420,6 +421,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(DistilBertModel, self).__init__(config)
......@@ -442,8 +444,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
def forward(self,
input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -468,24 +469,29 @@ class DistilBertModel(DistilBertPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim)
tfmr_output = self.transformer(x=inputs_embeds,
attn_mask=attention_mask,
head_mask=head_mask)
tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask)
hidden_state = tfmr_output[0]
output = (hidden_state, ) + tfmr_output[1:]
output = (hidden_state,) + tfmr_output[1:]
return output # last-layer hidden-state, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -516,6 +522,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForMaskedLM, self).__init__(config)
self.output_attentions = config.output_attentions
......@@ -534,28 +541,31 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
dlbrt_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
dlbrt_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
outputs = (prediction_logits, ) + dlbrt_output[1:]
outputs = (prediction_logits,) + dlbrt_output[1:]
if masked_lm_labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)),
masked_lm_labels.view(-1))
mlm_loss = self.mlm_loss_fct(
prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
)
outputs = (mlm_loss,) + outputs
return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -587,6 +597,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -599,10 +610,9 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
distilbert_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
......@@ -623,9 +633,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -663,6 +676,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(DistilBertForQuestionAnswering, self).__init__(config)
......@@ -673,11 +687,18 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
):
distilbert_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
......@@ -707,10 +728,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_INPUTS_DOCSTRING,
)
class DistilBertForTokenClassification(DistilBertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -740,6 +763,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
loss, scores = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
......@@ -750,13 +774,11 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, head_mask=None,
inputs_embeds=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.distilbert(input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
outputs = self.distilbert(
input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
)
sequence_output = outputs[0]
......
......@@ -18,14 +18,13 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import warnings
import torch
from torch import nn
from tqdm import trange
from .modeling_auto import AutoModel, AutoModelWithLMHead
logger = logging.getLogger(__name__)
......@@ -145,16 +144,12 @@ class PreTrainedEncoderDecoder(nn.Module):
# by the value of the flag `is_decoder` that we need to set correctly.
encoder = kwargs_encoder.pop("model", None)
if encoder is None:
encoder = AutoModel.from_pretrained(
encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
)
encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
encoder.config.is_decoder = False
decoder = kwargs_decoder.pop("model", None)
if decoder is None:
decoder = AutoModelWithLMHead.from_pretrained(
decoder_pretrained_model_name_or_path, **kwargs_decoder
)
decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
decoder.config.is_decoder = True
model = cls(encoder, decoder)
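A hypothetical usage sketch of the flag handling above (the checkpoint name and the package-root export are assumptions, not tested calls): both halves load by name, and only the decoder ends up flagged as a decoder.
from transformers import PreTrainedEncoderDecoder  # assumed export path
# placeholder checkpoint names; any encoder/decoder pair with configs works
model = PreTrainedEncoderDecoder.from_pretrained("bert-base-uncased", "bert-base-uncased")
assert model.encoder.config.is_decoder is False
assert model.decoder.config.is_decoder is True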
......@@ -173,13 +168,18 @@ class PreTrainedEncoderDecoder(nn.Module):
os.mkdir(save_directory)
# Check whether the output directory is empty or not
sub_directories = [directory for directory in os.listdir(save_directory)
if os.path.isdir(os.path.join(save_directory, directory))]
sub_directories = [
directory
for directory in os.listdir(save_directory)
if os.path.isdir(os.path.join(save_directory, directory))
]
if len(sub_directories) > 0:
if "encoder" in sub_directories and "decoder" in sub_directories:
print("WARNING: there is an older version of encoder-decoder saved in" +\
" the output directory. The default behaviour is to overwrite them.")
print(
"WARNING: there is an older version of encoder-decoder saved in"
+ " the output directory. The default behaviour is to overwrite them."
)
# Empty the output directory
for directory_to_remove in sub_directories:
......@@ -190,7 +190,7 @@ class PreTrainedEncoderDecoder(nn.Module):
# Remove the subdirectory itself
os.rmdir(os.path.join(save_directory, directory_to_remove))
assert(len(os.listdir(save_directory)) == 0) # sanity check
assert len(os.listdir(save_directory)) == 0 # sanity check
# Create the "encoder" directory inside the output directory and save the encoder into it
if not os.path.exists(os.path.join(save_directory, "encoder")):
......
......@@ -17,41 +17,41 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
logger = logging.getLogger(__name__)
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",
}
def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
""" Load tf checkpoints in a pytorch model
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(gpt2_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
......@@ -67,24 +67,24 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
for name, array in zip(names, arrays):
name = name[6:] # skip "model/"
name = name.split('/')
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'w' or l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'wpe' or l[0] == 'wte':
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, 'weight')
scope_names = [m_name]
if scope_names[0] == "w" or scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "wpe" or scope_names[0] == "wte":
pointer = getattr(pointer, scope_names[0])
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
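A toy rerun of the name parsing above (the sample scope names are invented): elements like "h3" split into a prefix and a layer index used to walk the module tree, while names without digits pass through whole.
import re
for m_name in ["h3", "wte", "c_attn"]:  # invented sample scope elements
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)
# h3 -> ['h', '3', '']   wte -> ['wte']   c_attn -> ['c_attn']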
......@@ -130,7 +130,7 @@ class Attention(nn.Module):
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
......@@ -146,7 +146,7 @@ class Attention(nn.Module):
if self.scale:
w = w / math.sqrt(v.size(-1))
nd, ns = w.size(-2), w.size(-1)
b = self.bias[:, :, ns-nd:ns, :ns]
b = self.bias[:, :, ns - nd : ns, :ns]
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
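A toy sketch of the bias slicing a few lines above (context size and offsets are made up): with cached keys, only the last nd query rows of the precomputed triangular bias are needed.
import torch
n_ctx = 6  # hypothetical maximum context
bias = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)
nd, ns = 2, 5  # 2 fresh queries attending over 5 keys total
b = bias[:, :, ns - nd : ns, :ns]
print(b[0, 0])
# tensor([[1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1.]])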
......@@ -226,10 +226,9 @@ class Block(nn.Module):
self.mlp = MLP(4 * nx, config)
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
output_attn = self.attn(self.ln_1(x),
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask)
output_attn = self.attn(
self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask
)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a
......@@ -244,6 +243,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_gpt2
......@@ -321,8 +321,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -350,6 +354,7 @@ class GPT2Model(GPT2PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(GPT2Model, self).__init__(config)
self.output_hidden_states = config.output_hidden_states
......@@ -377,7 +382,16 @@ class GPT2Model(GPT2PreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -430,8 +444,12 @@ class GPT2Model(GPT2PreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -454,10 +472,9 @@ class GPT2Model(GPT2PreTrainedModel):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i])
outputs = block(
hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i]
)
hidden_states, present = outputs[:2]
if self.output_past:
......@@ -486,8 +503,12 @@ class GPT2Model(GPT2PreTrainedModel):
return outputs # last hidden state, (presents), (all hidden_states), (attentions)
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -528,6 +549,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(GPT2LMHeadModel, self).__init__(config)
self.transformer = GPT2Model(config)
......@@ -538,15 +560,26 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -558,18 +591,21 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
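# Illustrative sketch, separate from the model code above (toy sizes assumed):
# the one-token shift scores the logits at position t against the label at
# position t + 1, which is what next-token language modeling requires.
import torch
from torch.nn import CrossEntropyLoss

batch, seq, vocab = 2, 5, 11
lm_logits = torch.randn(batch, seq, vocab)
labels = torch.randint(vocab, (batch, seq))

shift_logits = lm_logits[..., :-1, :].contiguous()  # drop the last position
shift_labels = labels[..., 1:].contiguous()         # drop the first token
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.shape)  # a scalar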
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
""",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
......@@ -632,6 +668,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(GPT2DoubleHeadsModel, self).__init__(config)
config.num_labels = 1
......@@ -644,15 +681,28 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
lm_labels=None,
mc_labels=None,
):
transformer_outputs = self.transformer(
input_ids,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
......@@ -662,15 +712,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
mc_labels.view(-1))
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs
if lm_labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
......@@ -15,8 +15,7 @@
# limitations under the License.
"""PyTorch MMBT model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
......@@ -26,12 +25,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .file_utils import add_start_docstrings
logger = logging.getLogger(__name__)
class ModalEmbeddings(nn.Module):
"""Generic Modal Embeddings which takes in an encoder, and a transformer embedding.
"""
def __init__(self, config, encoder, embeddings):
super(ModalEmbeddings, self).__init__()
self.config = config
......@@ -62,7 +63,9 @@ class ModalEmbeddings(nn.Module):
position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros((input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device)
token_type_ids = torch.zeros(
(input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
......@@ -140,8 +143,12 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
"""
@add_start_docstrings("The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare MMBT Model outputting raw hidden-states without any specific head on top.",
MMBT_START_DOCSTRING,
MMBT_INPUTS_DOCSTRING,
)
class MMBTModel(nn.Module):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -167,19 +174,29 @@ class MMBTModel(nn.Module):
encoder = ImageEncoder(args)
mmbt = MMBTModel(config, transformer, encoder)
"""
def __init__(self, config, transformer, encoder):
super(MMBTModel, self).__init__()
self.config = config
self.transformer = transformer
self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
def forward(self, input_modal, input_ids=None, modal_start_tokens=None,
modal_end_tokens=None, attention_mask=None,
token_type_ids=None, modal_token_type_ids=None,
position_ids=None, modal_position_ids=None, head_mask=None,
inputs_embeds=None, encoder_hidden_states=None,
encoder_attention_mask=None):
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -192,21 +209,22 @@ class MMBTModel(nn.Module):
device = input_ids.device if input_ids is not None else inputs_embeds.device
modal_embeddings = self.modal_encoder(input_modal,
modal_embeddings = self.modal_encoder(
input_modal,
start_token=modal_start_tokens,
end_token=modal_end_tokens,
position_ids=modal_position_ids,
token_type_ids=modal_token_type_ids)
token_type_ids=modal_token_type_ids,
)
input_modal_shape = modal_embeddings.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
txt_embeddings = self.transformer.embeddings(input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
txt_embeddings = self.transformer.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)
embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
......@@ -215,12 +233,16 @@ class MMBTModel(nn.Module):
if attention_mask is None:
attention_mask = torch.ones(input_shape, device=device)
else:
attention_mask = torch.cat([torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1)
attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(input_shape, device=device)
else:
encoder_attention_mask = torch.cat([torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1)
encoder_attention_mask = torch.cat(
[torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
)
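# Illustrative sketch (toy sizes assumed): the user-supplied text mask is
# prepended with ones covering the modal embeddings, so the joint
# [modal | text] sequence is masked consistently.
import torch

modal_len = 3
attention_mask = torch.tensor([[1, 1, 1, 1, 0]])  # text-only mask, shape (1, 5)
full_mask = torch.cat(
    [torch.ones(1, modal_len, dtype=torch.long), attention_mask], dim=1
)
print(full_mask)  # tensor([[1, 1, 1, 1, 1, 1, 1, 0]]), shape (1, 8)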
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
......@@ -254,7 +276,9 @@ class MMBTModel(nn.Module):
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
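# Illustrative sketch (values assumed): the 1/0 keep-mask becomes an additive
# bias, 0 where attention is allowed and -10000 where it is masked, so the
# softmax drives masked positions to near-zero probability.
import torch

keep = torch.tensor([[1.0, 1.0, 0.0]])[:, None, None, :]  # (batch, 1, 1, seq)
additive = (1.0 - keep) * -10000.0                        # [[0, 0, -10000]]
scores = torch.zeros(1, 1, 1, 3) + additive
print(torch.softmax(scores, dim=-1))  # ~[[0.5, 0.5, 0.0]]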
# Prepare head mask if needed
......@@ -267,25 +291,31 @@ class MMBTModel(nn.Module):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers
encoder_outputs = self.transformer.encoder(embedding_output,
encoder_outputs = self.transformer.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask)
encoder_attention_mask=encoder_extended_attention_mask,
)
sequence_output = encoder_outputs[0]
pooled_output = self.transformer.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
def get_input_embeddings(self):
return self.embeddings.word_embeddings
......@@ -293,8 +323,12 @@ class MMBTModel(nn.Module):
self.embeddings.word_embeddings = value
@add_start_docstrings("""MMBT Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""MMBT Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output)""",
MMBT_START_DOCSTRING,
MMBT_INPUTS_DOCSTRING,
)
class MMBTForClassification(nn.Module):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -333,11 +367,25 @@ class MMBTForClassification(nn.Module):
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None,
attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None,
modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.mmbt(input_modal=input_modal, input_ids=input_ids,
def forward(
self,
input_modal,
input_ids=None,
modal_start_tokens=None,
modal_end_tokens=None,
attention_mask=None,
token_type_ids=None,
modal_token_type_ids=None,
position_ids=None,
modal_position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.mmbt(
input_modal=input_modal,
input_ids=input_ids,
modal_start_tokens=modal_start_tokens,
modal_end_tokens=modal_end_tokens,
attention_mask=attention_mask,
......@@ -346,7 +394,8 @@ class MMBTForClassification(nn.Module):
position_ids=position_ids,
modal_position_ids=modal_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
pooled_output = outputs[1]
......
......@@ -17,26 +17,26 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
logger = logging.getLogger(__name__)
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"
}
def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
......@@ -45,17 +45,17 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
import re
import numpy as np
if '.ckpt' in openai_checkpoint_folder_path:
if ".ckpt" in openai_checkpoint_folder_path:
openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle:
names = json.load(names_handle)
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle:
shapes = json.load(shapes_handle)
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)]
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
......@@ -83,23 +83,23 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
name = name[6:] # skip "model/"
assert name[-2:] == ":0"
name = name[:-2]
name = name.split('/')
name = name.split("/")
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+\d+", m_name):
scope_names = re.split(r"(\d+)", m_name)
else:
l = [m_name]
if l[0] == 'g':
pointer = getattr(pointer, 'weight')
elif l[0] == 'b':
pointer = getattr(pointer, 'bias')
elif l[0] == 'w':
pointer = getattr(pointer, 'weight')
scope_names = [m_name]
if scope_names[0] == "g":
pointer = getattr(pointer, "weight")
elif scope_names[0] == "b":
pointer = getattr(pointer, "bias")
elif scope_names[0] == "w":
pointer = getattr(pointer, "weight")
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = getattr(pointer, scope_names[0])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
try:
assert pointer.shape == array.shape
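# Illustrative sketch (example scope names assumed): how a TF scope segment
# with a trailing number is split so the digits can index into a ModuleList
# on the PyTorch side.
import re

for m_name in ["g", "w", "h0", "block12"]:
    if re.fullmatch(r"[A-Za-z]+\d+", m_name):
        scope_names = re.split(r"(\d+)", m_name)  # "h0" -> ["h", "0", ""]
    else:
        scope_names = [m_name]
    print(m_name, "->", scope_names)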
......@@ -156,7 +156,7 @@ class Attention(nn.Module):
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
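# Illustrative sketch (toy sizes assumed: 2 heads of size 2, so the fused qkv
# split_size is 4): keeping head 0 keeps its columns in each of the q, k and v
# thirds of the fused c_attn weight.
import torch

n_heads, head_size = 2, 2
split_size = n_heads * head_size
mask = torch.tensor([1, 1, 0, 0])             # keep head 0, prune head 1
index = torch.arange(split_size)[mask.eq(1)]  # tensor([0, 1])
index_attn = torch.cat([index, index + split_size, index + 2 * split_size])
print(index_attn)                             # tensor([0, 1, 4, 5, 8, 9])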
......@@ -172,7 +172,7 @@ class Attention(nn.Module):
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + - 1e4 * (1 - b)
w = w * b + -1e4 * (1 - b)
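# Illustrative sketch (toy sizes assumed): b is a cropped lower-triangular 1/0
# matrix, so the masking line leaves scores for positions j <= i untouched and
# pushes future positions to -1e4 before the softmax.
import torch

qlen = klen = 4
w = torch.zeros(1, 1, qlen, klen)  # stand-in attention scores
b = torch.tril(torch.ones(qlen, klen)).view(1, 1, qlen, klen)
w = w * b + -1e4 * (1 - b)
print(torch.softmax(w, dim=-1)[0, 0])  # row i spreads mass over columns <= i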
if attention_mask is not None:
# Apply the attention mask
......@@ -261,6 +261,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_openai_gpt
......@@ -330,8 +331,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -354,6 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(OpenAIGPTModel, self).__init__(config)
self.output_attentions = config.output_attentions
......@@ -379,7 +385,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
......@@ -422,8 +436,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.n_layer
......@@ -463,8 +481,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
return outputs # last hidden state, (all hidden states), (all attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -496,6 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(OpenAIGPTLMHeadModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config)
......@@ -506,14 +529,24 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -524,18 +557,21 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, (all hidden states), (all attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
""",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
......@@ -587,6 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
......@@ -600,14 +637,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
mc_token_ids=None, lm_labels=None, mc_labels=None):
transformer_outputs = self.transformer(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
lm_labels=None,
mc_labels=None,
):
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -616,15 +665,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
mc_labels.view(-1))
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs
if lm_labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs
return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
......@@ -15,8 +15,7 @@
# limitations under the License.
"""PyTorch RoBERTa model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
......@@ -24,31 +23,35 @@ import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
"roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
"roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
"roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
"distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
"roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
"roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
}
class RobertaEmbeddings(BertEmbeddings):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config):
super(RobertaEmbeddings, self).__init__(config)
self.padding_idx = 1
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
padding_idx=self.padding_idx)
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if position_ids is None:
......@@ -58,10 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
return super(RobertaEmbeddings, self).forward(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
inputs_embeds=inputs_embeds)
return super(RobertaEmbeddings, self).forward(
input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
)
def create_position_ids_from_input_ids(self, x):
""" Replace non-padding symbols with their position numbers. Position numbers begin at
......@@ -85,8 +87,9 @@ class RobertaEmbeddings(BertEmbeddings):
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long,
device=inputs_embeds.device)
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
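# Illustrative sketch (toy shapes assumed): with padding_idx = 1, generated
# position ids start at 2, matching fairseq's convention of reserving the
# first positions for padding.
import torch

padding_idx = 1
inputs_embeds = torch.zeros(2, 4, 8)  # (batch, seq, hidden)
seq_len = inputs_embeds.size(1)
position_ids = torch.arange(padding_idx + 1, seq_len + padding_idx + 1, dtype=torch.long)
print(position_ids)                                        # tensor([2, 3, 4, 5])
print(position_ids.unsqueeze(0).expand(2, seq_len).shape)  # torch.Size([2, 4])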
......@@ -162,8 +165,12 @@ ROBERTA_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaModel(BertModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -209,8 +216,10 @@ class RobertaModel(BertModel):
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@add_start_docstrings(
"""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING
)
class RobertaForMaskedLM(BertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -256,14 +265,24 @@ class RobertaForMaskedLM(BertPreTrainedModel):
def get_output_embeddings(self):
return self.lm_head.decoder
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
masked_lm_labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
masked_lm_labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
......@@ -299,9 +318,12 @@ class RobertaLMHead(nn.Module):
return x
@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
@add_start_docstrings(
"""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForSequenceClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -344,14 +366,24 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.roberta = RobertaModel(config)
self.classifier = RobertaClassificationHead(config)
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
......@@ -369,9 +401,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForMultipleChoice(BertPreTrainedModel):
r"""
Inputs:
......@@ -455,16 +490,29 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None, inputs_embeds=None):
def forward(
self,
input_ids=None,
token_type_ids=None,
attention_mask=None,
labels=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask, head_mask=head_mask)
outputs = self.roberta(
flat_input_ids,
position_ids=flat_position_ids,
token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask,
head_mask=head_mask,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
......@@ -481,9 +529,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Roberta Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForTokenClassification(BertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -527,15 +578,25 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
inputs_embeds=inputs_embeds,
)
sequence_output = outputs[0]
......@@ -577,9 +638,12 @@ class RobertaClassificationHead(nn.Module):
return x
@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
ROBERTA_START_DOCSTRING,
ROBERTA_INPUTS_DOCSTRING,
)
class RobertaForQuestionAnswering(BertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......@@ -626,14 +690,24 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
start_positions=None, end_positions=None):
outputs = self.roberta(input_ids,
def forward(
self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
start_positions=None,
end_positions=None,
):
outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
head_mask=head_mask,
)
sequence_output = outputs[0]
......
......@@ -16,23 +16,21 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import copy
import itertools
import logging
import math
import os
import sys
import copy
import itertools
from io import open
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch import nn
from torch.nn import CrossEntropyLoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_t5 import T5Config
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings
from .modeling_utils import PreTrainedModel, prune_linear_layer
logger = logging.getLogger(__name__)
......@@ -41,13 +39,14 @@ logger = logging.getLogger(__name__)
# for the pretrained weights provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
"t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
"t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
"t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
"t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
"t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
}
####################################################
# This is a conversion method from TF 1.0 to PyTorch
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
......@@ -60,8 +59,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
logger.error(
"Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
......@@ -76,44 +77,44 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
tf_weights[name] = array
for txt_name in names:
name = txt_name.split('/')
name = txt_name.split("/")
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
if '_slot_' in name[-1]:
if "_slot_" in name[-1]:
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
pointer = model
array = tf_weights[txt_name]
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
scope_names = re.split(r"_(\d+)", m_name)
else:
l = [m_name]
if l[0] in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
# elif l[0] == 'scale':
scope_names = [m_name]
if scope_names[0] in ["kernel", "scale", "embedding"]:
pointer = getattr(pointer, "weight")
# elif scope_names[0] == 'scale':
# pointer = getattr(pointer, 'weight')
# elif l[0] == 'output_bias' or l[0] == 'beta':
# elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta':
# pointer = getattr(pointer, 'bias')
# elif l[0] == 'squad':
# elif scope_names[0] == 'squad':
# pointer = getattr(pointer, 'classifier')
else:
try:
pointer = getattr(pointer, l[0])
pointer = getattr(pointer, scope_names[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
if len(scope_names) >= 2:
num = int(scope_names[1])
pointer = pointer[num]
if l[0] not in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
if l[0] != 'embedding':
if scope_names[0] not in ["kernel", "scale", "embedding"]:
pointer = getattr(pointer, "weight")
if scope_names[0] != "embedding":
logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
array = np.transpose(array)
try:
......@@ -125,7 +126,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
pointer.data = torch.from_numpy(array.astype(np.float32))
tf_weights.pop(txt_name, None)
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys())))
# logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model
......@@ -136,6 +137,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
####################################################
class T5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
""" Construct a layernorm module in the T5 style
......@@ -228,10 +230,7 @@ class T5Attention(nn.Module):
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position,
bidirectional=True,
num_buckets=32,
max_distance=128):
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
......@@ -267,12 +266,12 @@ class T5Attention(nn.Module):
# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = (n < max_exact)
is_small = n < max_exact
# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (
torch.log(n.float() / max_exact)
/ math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
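# Illustrative sketch (unidirectional case assumed, num_buckets=32,
# max_distance=128): distances below 16 get their own exact bucket, while
# larger distances share logarithmically spaced buckets up to bucket 31.
import math
import torch

num_buckets, max_distance = 32, 128
n = torch.arange(0, 200)  # relative distances
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
    torch.log(n.float().clamp(min=1) / max_exact)  # clamp guards log(0); those entries are overridden below
    / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
buckets = torch.where(is_small, n, val_if_large)
print(buckets[[0, 15, 16, 64, 199]])  # tensor([ 0, 15, 16, 26, 31])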
......@@ -283,9 +282,11 @@ class T5Attention(nn.Module):
context_position = torch.arange(qlen, dtype=torch.long)[:, None]
memory_position = torch.arange(klen, dtype=torch.long)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(
relative_position, # shape (qlen, klen)
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets)
num_buckets=self.relative_attention_num_buckets,
)
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
return values
......@@ -298,7 +299,7 @@ class T5Attention(nn.Module):
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
klen = qlen if cache is None else cache["slen"] + qlen
else:
klen = kv.size(1)
......@@ -330,7 +331,7 @@ class T5Attention(nn.Module):
cache[self.layer_id] = (k, v)
# q = q / math.sqrt(dim_per_head) # No scaling in T5
scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen)
scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen)
if position_bias is None:
if not self.has_relative_attention_bias:
......@@ -369,10 +370,9 @@ class T5LayerSelfAttention(nn.Module):
def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(norm_x,
mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
attention_output = self.SelfAttention(
norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask
)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
......@@ -388,11 +388,9 @@ class T5LayerCrossAttention(nn.Module):
def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(norm_x,
mask=attention_mask,
kv=kv,
position_bias=position_bias,
head_mask=head_mask)
attention_output = self.EncDecAttention(
norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask
)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
......@@ -411,26 +409,36 @@ class T5Block(nn.Module):
else:
self.layer.append(T5LayerFF(config))
def forward(self, hidden_states, attention_mask=None, position_bias=None,
encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
head_mask=None):
self_attention_outputs = self.layer[0](hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
def forward(
self,
hidden_states,
attention_mask=None,
position_bias=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
encoder_decoder_position_bias=None,
head_mask=None,
):
self_attention_outputs = self.layer[0](
hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask
)
hidden_states = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights
if not self.is_decoder:
hidden_states = self.layer[1](hidden_states)
else:
cross_attention_outputs = self.layer[1](hidden_states,
cross_attention_outputs = self.layer[1](
hidden_states,
kv=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
head_mask=head_mask)
head_mask=head_mask,
)
hidden_states = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights
outputs = (
outputs + cross_attention_outputs[1:]
) # Keep cross-attention outputs and relative position weights
hidden_states = self.layer[2](hidden_states)
outputs = (hidden_states,) + outputs # add attentions if we output them
......@@ -441,6 +449,7 @@ class T5PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = T5Config
pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_t5
......@@ -450,29 +459,31 @@ class T5PreTrainedModel(PreTrainedModel):
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {'decoder_input_ids': input_ids,
'encoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
dummy_inputs = {
"decoder_input_ids": input_ids,
"encoder_input_ids": input_ids,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _init_weights(self, module):
""" Initialize the weights """
factor = self.config.initializer_factor # Used for testing weights initialization
if isinstance(module, T5LayerNorm):
module.weight.data.fill_(factor*1.0)
module.weight.data.fill_(factor * 1.0)
elif isinstance(module, (T5Model, T5WithLMHeadModel)):
# Mesh TensorFlow embeddings initialization
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
elif isinstance(module, T5DenseReluDense):
# Mesh TensorFlow FF initialization
# See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
# and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
if hasattr(module.wi, 'bias') and module.wi.bias is not None:
module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
if hasattr(module.wi, "bias") and module.wi.bias is not None:
module.wi.bias.data.zero_()
module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
if hasattr(module.wo, 'bias') and module.wo.bias is not None:
module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
if hasattr(module.wo, "bias") and module.wo.bias is not None:
module.wo.bias.data.zero_()
elif isinstance(module, T5Attention):
# Mesh TensorFlow attention initialization to avoid scaling before softmax
......@@ -480,12 +491,12 @@ class T5PreTrainedModel(PreTrainedModel):
d_model = self.config.d_model
d_kv = self.config.d_kv
n_heads = self.config.num_heads
module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5))
module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5))
module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5))
if module.has_relative_attention_bias:
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
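# Illustrative arithmetic (dims assumed from t5-small: d_model=512, d_kv=64,
# num_heads=8, factor=1.0) for the Mesh-TF-style stds used above:
d_model, d_kv, n_heads, factor = 512, 64, 8, 1.0
print(factor * ((d_model * d_kv) ** -0.5))  # q:    ~0.0055
print(factor * (d_model ** -0.5))           # k, v: ~0.0442
print(factor * ((n_heads * d_kv) ** -0.5))  # o:    ~0.0442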
class T5Stack(T5PreTrainedModel):
......@@ -495,19 +506,22 @@ class T5Stack(T5PreTrainedModel):
self.output_hidden_states = config.output_hidden_states
self.is_decoder = config.is_decoder
self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
for i in range(config.num_layers)])
self.block = nn.ModuleList(
[T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
)
self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.init_weights()
def forward(self,
def forward(
self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None):
head_mask=None,
):
batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
if attention_mask is None:
......@@ -557,7 +571,9 @@ class T5Stack(T5PreTrainedModel):
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = encoder_extended_attention_mask.to(
dtype=next(self.parameters()).dtype
) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
else:
encoder_extended_attention_mask = None
......@@ -572,8 +588,12 @@ class T5Stack(T5PreTrainedModel):
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
head_mask = (
head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
) # We can specify head_mask for each layer
head_mask = head_mask.to(
dtype=next(self.parameters()).dtype
) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_layers
......@@ -587,13 +607,15 @@ class T5Stack(T5PreTrainedModel):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states,
layer_outputs = layer_module(
hidden_states,
attention_mask=extended_attention_mask,
position_bias=position_bias,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
encoder_decoder_position_bias=encoder_decoder_position_bias,
head_mask=head_mask[i])
head_mask=head_mask[i],
)
# layer_outputs is a tuple with:
# hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
hidden_states = layer_outputs[0]
......@@ -672,9 +694,12 @@ T5_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
"without any specific head on top.",
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.",
T5_START_DOCSTRING,
T5_INPUTS_DOCSTRING,
)
class T5Model(T5PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -697,6 +722,7 @@ class T5Model(T5PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(T5Model, self).__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
......@@ -729,12 +755,13 @@ class T5Model(T5PreTrainedModel):
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
# that apply to the model as whole.
# We let the specific kwargs override the common ones in case of conflict.
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_common = dict(
(k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
)
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
......@@ -770,8 +797,7 @@ class T5Model(T5PreTrainedModel):
return decoder_outputs + encoder_outputs
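# Illustrative sketch (example kwargs assumed) of the prefix routing used in
# forward above: un-prefixed kwargs go to both stacks, while "encoder_" and
# "decoder_" prefixed kwargs are stripped of their prefix and override per stack.
kwargs = {
    "attention_mask": "common-mask",
    "encoder_input_ids": "enc-ids",
    "decoder_input_ids": "dec-ids",
}
kwargs_common = {
    k: v for k, v in kwargs.items()
    if not k.startswith("encoder_") and not k.startswith("decoder_")
}
kwargs_encoder = dict(kwargs_common)
kwargs_decoder = dict(kwargs_common)
kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})
print(kwargs_encoder)  # {'attention_mask': 'common-mask', 'input_ids': 'enc-ids'}
print(kwargs_decoder)  # {'attention_mask': 'common-mask', 'input_ids': 'dec-ids'}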
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5WithLMHeadModel(T5PreTrainedModel):
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......@@ -802,6 +828,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(T5WithLMHeadModel, self).__init__(config)
self.model_dim = config.d_model
......@@ -834,14 +861,15 @@ class T5WithLMHeadModel(T5PreTrainedModel):
# that apply to the model as whole.
# We let the specific kwargs override the common ones in case of conflict.
lm_labels = kwargs.pop('decoder_lm_labels', None)
lm_labels = kwargs.pop("decoder_lm_labels", None)
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_common = dict(
(k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")
)
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
......@@ -879,8 +907,9 @@ class T5WithLMHeadModel(T5PreTrainedModel):
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
decoder_outputs = (
loss,
) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
return decoder_outputs + encoder_outputs
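# A worked sketch (standalone, illustrative shapes) of the shifted loss computed
# above: the logit at position t is scored against the label at position t + 1,
# so the last logit and the first label are dropped before the flattened
# cross-entropy.
import torch
from torch.nn import CrossEntropyLoss

lm_logits = torch.randn(2, 5, 32)          # (batch, seq_len, vocab)
lm_labels = torch.randint(0, 32, (2, 5))   # (batch, seq_len)
shift_logits = lm_logits[..., :-1, :].contiguous()  # keep positions 0..3
shift_labels = lm_labels[..., 1:].contiguous()      # keep positions 1..4
loss = CrossEntropyLoss(ignore_index=-1)(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)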
......@@ -22,23 +22,22 @@ import sys
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .file_utils import add_start_docstrings
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
import logging
logger = logging.getLogger(__name__)
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
"albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
"albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
"albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
"albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
"albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
"albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
"albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
"albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
}
......@@ -50,21 +49,22 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
super(TFAlbertEmbeddings, self).__init__(**kwargs)
self.config = config
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='position_embeddings')
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
embeddings_initializer=get_initializer(self.config.initializer_range),
name="position_embeddings",
)
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.embedding_size,
embeddings_initializer=get_initializer(
self.config.initializer_range),
name='token_type_embeddings')
embeddings_initializer=get_initializer(self.config.initializer_range),
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
......@@ -75,7 +75,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.word_embeddings = self.add_weight(
"weight",
shape=[self.config.vocab_size, self.config.embedding_size],
initializer=get_initializer(self.config.initializer_range))
initializer=get_initializer(self.config.initializer_range),
)
super(TFAlbertEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False):
......@@ -145,34 +146,29 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(
config.hidden_size / config.num_attention_heads)
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='query')
self.key = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='key')
self.value = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='value')
self.dropout = tf.keras.layers.Dropout(
config.attention_probs_dropout_prob)
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(
x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
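# Shape flow for transpose_for_scores with illustrative sizes (batch=8, seq=128,
# 12 heads of 64 dims): (8, 128, 768) --reshape--> (8, 128, 12, 64)
# --transpose(0, 2, 1, 3)--> (8, 12, 128, 64), i.e. one attention map per head.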
def call(self, inputs, training=False):
......@@ -212,23 +208,21 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
return outputs
class TFAlbertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertSelfOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -245,12 +239,10 @@ class TFAlbertAttention(TFBertSelfAttention):
super(TFAlbertAttention, self).__init__(config, **kwargs)
self.hidden_size = config.hidden_size
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.pruned_heads = set()
def prune_heads(self, heads):
......@@ -293,11 +285,11 @@ class TFAlbertAttention(TFBertSelfAttention):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
self_outputs = (context_layer, attention_probs) if self.output_attentions else (
context_layer,)
self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
hidden_states = self_outputs[0]
......@@ -313,34 +305,37 @@ class TFAlbertAttention(TFBertSelfAttention):
class TFAlbertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFAlbertLayer, self).__init__(**kwargs)
self.attention = TFAlbertAttention(config, name='attention')
self.attention = TFAlbertAttention(config, name="attention")
self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn')
self.ffn = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='ffn_output')
self.ffn_output = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
)
self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
attention_outputs = self.attention(
[hidden_states, attention_mask, head_mask], training=training)
attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
ffn_output = self.ffn(attention_outputs[0])
ffn_output = self.activation(ffn_output)
ffn_output = self.ffn_output(ffn_output)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.full_layer_layer_norm(
ffn_output + attention_outputs[0])
hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0])
# add attentions if we output them
outputs = (hidden_states,) + attention_outputs[1:]
......@@ -353,8 +348,9 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format(
i)) for i in range(config.inner_group_num)]
self.albert_layers = [
TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -363,8 +359,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
layer_attentions = ()
for layer_index, albert_layer in enumerate(self.albert_layers):
layer_output = albert_layer(
[hidden_states, attention_mask, head_mask[layer_index]], training=training)
layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training)
hidden_states = layer_output[0]
if self.output_attentions:
......@@ -389,10 +384,15 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
self.config = config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), name='embedding_hidden_mapping_in')
self.albert_layer_groups = [TFAlbertLayerGroup(
config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embedding_hidden_mapping_in",
)
self.albert_layer_groups = [
TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i))
for i in range(config.num_hidden_groups)
]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -405,15 +405,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
for i in range(self.config.num_hidden_layers):
# Number of layers in a hidden group
layers_per_group = int(
self.config.num_hidden_layers / self.config.num_hidden_groups)
layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
# Index of the hidden group
group_idx = int(
i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
[hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
[
hidden_states,
attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
],
training=training,
)
hidden_states = layer_group_output[0]
if self.output_attentions:
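# Worked example of the ALBERT group indexing above, assuming an illustrative
# config with num_hidden_layers=12 and num_hidden_groups=3:
#   layers_per_group = 12 // 3 = 4
#   group_idx for layers 0..11 -> [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
# so each group's weights are reused layers_per_group times; this is how ALBERT
# shares parameters across depth.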
......@@ -436,6 +440,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "albert"
......@@ -446,31 +451,27 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
super(TFAlbertMLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dense = tf.keras.layers.Dense(config.embedding_size,
kernel_initializer=get_initializer(
config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.decoder_bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='decoder/bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.decoder_bias = self.add_weight(
shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
)
super(TFAlbertMLMHead, self).build(input_shape)
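# A minimal sketch (standalone helper, not a method; names illustrative) of the
# weight tying noted above: the decoder reuses the embedding matrix E as the
# output projection, so logits = hidden @ E^T + bias, and only the per-token
# biases are new parameters.
import tensorflow as tf

def tied_mlm_logits(hidden_states, embedding_matrix, bias):
    # hidden_states: (batch, seq, embed); embedding_matrix: (vocab, embed)
    batch_size, seq_len = tf.shape(hidden_states)[0], tf.shape(hidden_states)[1]
    x = tf.reshape(hidden_states, [-1, tf.shape(embedding_matrix)[1]])
    logits = tf.matmul(x, embedding_matrix, transpose_b=True) + bias
    return tf.reshape(logits, [batch_size, seq_len, -1])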
def call(self, hidden_states):
......@@ -560,8 +561,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -601,8 +606,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
config.initializer_range), activation='tanh', name='pooler')
self.pooler = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
name="pooler",
)
def get_input_embeddings(self):
return self.embeddings
......@@ -617,7 +626,16 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -627,12 +645,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
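# Usage sketch of the three input formats accepted above (tensors illustrative):
#   model(input_ids)                                                   # single tensor
#   model([input_ids, attention_mask])                                 # positional list
#   model({"input_ids": input_ids, "attention_mask": attention_mask})  # dict of named inputs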
......@@ -672,16 +690,14 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
embedding_output = self.embeddings(
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder(
[embedding_output, extended_attention_mask, head_mask], training=training)
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output[:, 0])
......@@ -692,8 +708,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
return outputs
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING
)
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -723,9 +740,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.albert = TFAlbertModel(config, name='albert')
self.predictions = TFAlbertMLMHead(
config, self.albert.embeddings, name='predictions')
self.albert = TFAlbertModel(config, name="albert")
self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
def get_output_embeddings(self):
return self.albert.embeddings
......@@ -734,8 +750,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
outputs = self.albert(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.predictions(
sequence_output, training=kwargs.get('training', False))
prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
# Add hidden states and attention if they are here
outputs = (prediction_scores,) + outputs[2:]
......@@ -743,9 +758,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return outputs # prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
ALBERT_START_DOCSTRING,
ALBERT_INPUTS_DOCSTRING,
)
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -771,22 +789,23 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertModel(config, name='albert')
self.albert = TFAlbertModel(config, name="albert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.albert(inputs, **kwargs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......
......@@ -18,32 +18,77 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
GPT2Config, OpenAIGPTConfig, RobertaConfig,
TransfoXLConfig, XLMConfig, XLNetConfig)
from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
from .file_utils import add_start_docstrings
from .configuration_auto import (
BertConfig,
CTRLConfig,
DistilBertConfig,
GPT2Config,
OpenAIGPTConfig,
RobertaConfig,
TransfoXLConfig,
XLMConfig,
XLNetConfig,
)
from .modeling_tf_albert import (
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFAlbertForMaskedLM,
TFAlbertForSequenceClassification,
TFAlbertModel,
)
from .modeling_tf_bert import (
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFBertForMaskedLM,
TFBertForQuestionAnswering,
TFBertForSequenceClassification,
TFBertForTokenClassification,
TFBertModel,
)
from .modeling_tf_ctrl import TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, TFCTRLLMHeadModel, TFCTRLModel
from .modeling_tf_distilbert import (
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TFDistilBertForMaskedLM,
TFDistilBertForQuestionAnswering,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertModel,
)
from .modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, TFGPT2LMHeadModel, TFGPT2Model
from .modeling_tf_openai import TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
from .modeling_tf_roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
TFRobertaForMaskedLM,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TFRobertaModel,
)
from .modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, TFT5Model, TFT5WithLMHeadModel
from .modeling_tf_transfo_xl import (
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
TFTransfoXLLMHeadModel,
TFTransfoXLModel,
)
from .modeling_tf_xlm import (
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
TFXLMForQuestionAnsweringSimple,
TFXLMForSequenceClassification,
TFXLMModel,
TFXLMWithLMHeadModel,
)
from .modeling_tf_xlnet import (
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
TFXLNetForQuestionAnsweringSimple,
TFXLNetForSequenceClassification,
TFXLNetForTokenClassification,
TFXLNetLMHeadModel,
TFXLNetModel,
)
logger = logging.getLogger(__name__)
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict(
(key, value)
for pretrained_map in [
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
......@@ -57,7 +102,8 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
]
for key, value in pretrained_map.items())
for key, value in pretrained_map.items()
)
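# The dict(...) generator above flattens several {shortcut-name: url} maps into
# one lookup table; an equivalent (illustrative) spelling with dict.update,
# where later maps win on duplicate keys:
merged_archive_map = {}
for pretrained_map in [TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP]:
    merged_archive_map.update(pretrained_map)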
class TFAutoModel(object):
......@@ -85,10 +131,13 @@ class TFAutoModel(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModel is designed to be instantiated "
raise EnvironmentError(
"TFAutoModel is designed to be instantiated "
"using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModel.from_config(config)` methods.")
"`TFAutoModel.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -209,32 +258,34 @@ class TFAutoModel(object):
model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
)
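# Usage sketch for the substring dispatch above. The ordering of the branches
# matters because some identifiers nest: "distilbert" and "roberta" both contain
# "bert", so the more specific checks must run before the plain "bert" branch.
model = TFAutoModel.from_pretrained("bert-base-uncased")  # -> TFBertModel
model = TFAutoModel.from_pretrained("roberta-base")       # -> TFRobertaModel, not TFBertModel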
class TFAutoModelWithLMHead(object):
......@@ -262,10 +313,13 @@ class TFAutoModelWithLMHead(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelWithLMHead is designed to be instantiated "
"using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelWithLMHead.from_config(config)` methods.")
"`TFAutoModelWithLMHead.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -390,32 +444,34 @@ class TFAutoModelWithLMHead(object):
model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)
)
class TFAutoModelForSequenceClassification(object):
......@@ -438,10 +494,13 @@ class TFAutoModelForSequenceClassification(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForSequenceClassification is designed to be instantiated "
"using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForSequenceClassification.from_config(config)` methods.")
"`TFAutoModelForSequenceClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -552,21 +611,33 @@ class TFAutoModelForSequenceClassification(object):
model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "albert" in pretrained_model_name_or_path:
return TFAlbertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return TFBertForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForSequenceClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm" in pretrained_model_name_or_path:
return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)
)
class TFAutoModelForQuestionAnswering(object):
......@@ -588,10 +659,13 @@ class TFAutoModelForQuestionAnswering(object):
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForQuestionAnswering is designed to be instantiated "
"using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForQuestionAnswering.from_config(config)` methods.")
"`TFAutoModelForQuestionAnswering.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -615,9 +689,9 @@ class TFAutoModelForQuestionAnswering(object):
elif isinstance(config, BertConfig):
return TFBertForQuestionAnswering(config)
elif isinstance(config, XLNetConfig):
return TFXLNetForQuestionAnswering(config)
raise NotImplementedError("TFXLNetForQuestionAnswering isn't implemented")
elif isinstance(config, XLMConfig):
return TFXLMForQuestionAnswering(config)
raise NotImplementedError("TFXLMForQuestionAnswering isn't implemented")
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -698,24 +772,34 @@ class TFAutoModelForQuestionAnswering(object):
model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
if "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForQuestionAnswering.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "bert" in pretrained_model_name_or_path:
return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForQuestionAnsweringSimple.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "xlm" in pretrained_model_name_or_path:
return TFXLMForQuestionAnsweringSimple.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)
)
class TFAutoModelForTokenClassification:
def __init__(self):
raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated "
raise EnvironmentError(
"TFAutoModelForTokenClassification is designed to be instantiated "
"using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`AutoModelForTokenClassification.from_config(config)` methods.")
"`AutoModelForTokenClassification.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
......@@ -815,14 +899,20 @@ class TFAutoModelForTokenClassification:
model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'bert' in pretrained_model_name_or_path:
if "bert" in pretrained_model_name_or_path:
return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
elif "distilbert" in pretrained_model_name_or_path:
return TFDistilBertForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
elif "roberta" in pretrained_model_name_or_path:
return TFRobertaForTokenClassification.from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)
)
......@@ -17,43 +17,40 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_bert import BertConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
logger = logging.getLogger(__name__)
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
}
......@@ -67,6 +64,7 @@ def gelu(x):
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
def gelu_new(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
......@@ -76,41 +74,48 @@ def gelu_new(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
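# For reference, the two variants above compute
#   gelu(x)     = 0.5 * x * (1 + erf(x / sqrt(2)))                           (exact)
#   gelu_new(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))  (tanh approximation)
# gelu_new is the smoother approximation used by the original GPT/BERT codebases.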
def swish(x):
return x * tf.sigmoid(x)
ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
ACT2FN = {
"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish),
"gelu_new": tf.keras.layers.Activation(gelu_new)}
"gelu_new": tf.keras.layers.Activation(gelu_new),
}
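# Usage sketch: config.hidden_act may be either a string key into ACT2FN or a
# callable, which is why the isinstance(..., str) checks appear in the layers
# below. For example (using the module's tf import):
act = ACT2FN["gelu"]
y = act(tf.constant([-1.0, 0.0, 1.0]))  # elementwise GELU over the tensor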
class TFBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config, **kwargs):
super(TFBertEmbeddings, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range),
name='position_embeddings')
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
name="position_embeddings",
)
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range),
name='token_type_embeddings')
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
......@@ -121,7 +126,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range))
initializer=get_initializer(self.initializer_range),
)
super(TFBertEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding", training=False):
......@@ -193,7 +199,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.output_attentions = config.output_attentions
self.num_attention_heads = config.num_attention_heads
......@@ -201,15 +208,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='query')
self.key = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='key')
self.value = tf.keras.layers.Dense(self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
name='value')
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
......@@ -230,7 +237,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
) # (batch size, num_heads, seq_len_q, seq_len_k)
dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
attention_scores = attention_scores / tf.math.sqrt(dk)
......@@ -252,8 +261,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(context_layer,
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
return outputs
......@@ -262,10 +272,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
class TFBertSelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertSelfOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -280,8 +290,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
class TFBertAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertAttention, self).__init__(**kwargs)
self.self_attention = TFBertSelfAttention(config, name='self')
self.dense_output = TFBertSelfOutput(config, name='output')
self.self_attention = TFBertSelfAttention(config, name="self")
self.dense_output = TFBertSelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
......@@ -298,10 +308,12 @@ class TFBertAttention(tf.keras.layers.Layer):
class TFBertIntermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertIntermediate, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.intermediate_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
......@@ -315,10 +327,10 @@ class TFBertIntermediate(tf.keras.layers.Layer):
class TFBertOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertOutput, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, inputs, training=False):
......@@ -333,9 +345,9 @@ class TFBertOutput(tf.keras.layers.Layer):
class TFBertLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertLayer, self).__init__(**kwargs)
self.attention = TFBertAttention(config, name='attention')
self.intermediate = TFBertIntermediate(config, name='intermediate')
self.bert_output = TFBertOutput(config, name='output')
self.attention = TFBertAttention(config, name="attention")
self.intermediate = TFBertIntermediate(config, name="intermediate")
self.bert_output = TFBertOutput(config, name="output")
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -353,7 +365,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
super(TFBertEncoder, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)]
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask = inputs
......@@ -385,10 +397,12 @@ class TFBertEncoder(tf.keras.layers.Layer):
class TFBertPooler(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertPooler, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
self.dense = tf.keras.layers.Dense(
config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation='tanh',
name='dense')
activation="tanh",
name="dense",
)
def call(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
......@@ -401,14 +415,16 @@ class TFBertPooler(tf.keras.layers.Layer):
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm')
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
......@@ -421,17 +437,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFBertLMPredictionHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.transform = TFBertPredictionHeadTransform(config, name='transform')
self.transform = TFBertPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFBertLMPredictionHead, self).build(input_shape)
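The comment above describes weight tying: the LM head reuses the input embedding matrix as its output projection and only adds the per-token bias built here. A rough sketch of the decode step, assuming a shared embedding matrix `E` of shape `(vocab_size, hidden)`:

import tensorflow as tf

vocab_size, hidden = 100, 16
E = tf.random.normal((vocab_size, hidden))    # shared embedding matrix
bias = tf.zeros((vocab_size,))                # the output-only bias built above
hidden_states = tf.random.normal((2, 8, hidden))

# Project hidden states back onto the vocabulary with the transposed embeddings.
logits = tf.matmul(hidden_states, E, transpose_b=True) + bias  # (2, 8, vocab_size)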
def call(self, hidden_states):
......@@ -444,7 +457,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
class TFBertMLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFBertMLMHead, self).__init__(**kwargs)
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions')
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
......@@ -454,9 +467,9 @@ class TFBertMLMHead(tf.keras.layers.Layer):
class TFBertNSPHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFBertNSPHead, self).__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense(2,
kernel_initializer=get_initializer(config.initializer_range),
name='seq_relationship')
self.seq_relationship = tf.keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
)
def call(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
......@@ -468,9 +481,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
super(TFBertMainLayer, self).__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers
self.embeddings = TFBertEmbeddings(config, name='embeddings')
self.encoder = TFBertEncoder(config, name='encoder')
self.pooler = TFBertPooler(config, name='pooler')
self.embeddings = TFBertEmbeddings(config, name="embeddings")
self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler")
def get_input_embeddings(self):
return self.embeddings
......@@ -485,7 +498,16 @@ class TFBertMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -495,12 +517,12 @@ class TFBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
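This branching means the same `call` accepts a bare tensor, a positional list, or a keyword dict. A self-contained toy layer mirroring the convention, with the three equivalent invocations:

import tensorflow as tf

class ToyLayer(tf.keras.layers.Layer):
    # Mirrors the input-parsing convention above in miniature.
    def call(self, inputs, attention_mask=None):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
        elif isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
        else:
            input_ids = inputs
        return input_ids if attention_mask is None else input_ids * attention_mask

layer = ToyLayer()
ids = tf.constant([[1, 2, 3]])
mask = tf.constant([[1, 1, 0]])
assert tf.reduce_all(layer(ids) == ids)
assert tf.reduce_all(layer([ids, mask]) == layer({"input_ids": ids, "attention_mask": mask}))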
......@@ -540,7 +562,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -552,7 +574,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
......@@ -560,6 +584,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = BertConfig
pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "bert"
......@@ -648,8 +673,12 @@ BERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -682,18 +711,22 @@ class TFBertModel(TFBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertModel, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
return outputs
@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
@add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training:
a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForPreTraining(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -721,12 +754,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.nsp = TFBertNSPHead(config, name='nsp___cls')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.nsp = TFBertNSPHead(config, name="nsp___cls")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.bert.embeddings
......@@ -735,16 +769,19 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
outputs = self.bert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2]
prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
seq_relationship_score = self.nsp(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING
)
class TFBertForMaskedLM(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -770,11 +807,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.bert.embeddings
......@@ -783,15 +821,18 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
outputs = self.bert(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
return outputs # prediction_scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -817,11 +858,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
seq_relationship_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.nsp = TFBertNSPHead(config, name='nsp___cls')
self.bert = TFBertMainLayer(config, name="bert")
self.nsp = TFBertNSPHead(config, name="nsp___cls")
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
......@@ -834,9 +876,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
return outputs # seq_relationship_score, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForSequenceClassification(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -862,22 +907,23 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -885,9 +931,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForMultipleChoice(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -915,16 +964,26 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
classification_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(1,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -934,12 +993,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
......@@ -956,7 +1015,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
outputs = self.bert(flat_inputs, training=training)
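For multiple choice, inputs arrive as `(batch, num_choices, seq_len)` and are flattened to `(batch * num_choices, seq_len)` so the shared encoder sees one row per choice; the per-choice logits are then reshaped back. A shape-only sketch:

import tensorflow as tf

batch, num_choices, seq_len = 2, 4, 16
input_ids = tf.zeros((batch, num_choices, seq_len), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_len))    # (8, 16): one row per choice
logits = tf.zeros((batch * num_choices, 1))              # stand-in for classifier output
reshaped_logits = tf.reshape(logits, (-1, num_choices))  # (2, 4): scores per example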
......@@ -971,9 +1037,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
return outputs # reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForTokenClassification(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -999,22 +1068,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -1022,9 +1092,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
return outputs # scores, (hidden_states), (attentions)
@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
BERT_START_DOCSTRING,
BERT_INPUTS_DOCSTRING,
)
class TFBertForQuestionAnswering(TFBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -1052,14 +1125,15 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, name='bert')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
self.bert = TFBertMainLayer(config, name="bert")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
def call(self, inputs, **kwargs):
outputs = self.bert(inputs, **kwargs)
......
......@@ -18,29 +18,28 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list, TFSharedEmbeddings
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
logger = logging.getLogger(__name__)
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"}
def angle_defn(pos, i, d_model_size):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model_size))
angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size))
return pos * angle_rates
def positional_encoding(position, d_model_size):
# create the sinusoidal pattern for the positional encoding
angle_rads = angle_defn(np.arange(position)[:, np.newaxis],
np.arange(d_model_size)[np.newaxis, :],
d_model_size)
angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
sines = np.sin(angle_rads[:, 0::2])
cosines = np.cos(angle_rads[:, 1::2])
......@@ -49,6 +48,7 @@ def positional_encoding(position, d_model_size):
pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32)
return pos_encoding
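A quick sanity check of the encoding above, reusing the `positional_encoding` function just defined: at position 0 every angle is zero, so the sine half of the vector is all zeros and the cosine half all ones.

import numpy as np

d_model_size = 8
pos_encoding = positional_encoding(4, d_model_size)  # function defined above

row0 = pos_encoding.numpy()[0]
assert np.allclose(row0[: d_model_size // 2], 0.0)  # sines of 0
assert np.allclose(row0[d_model_size // 2 :], 1.0)  # cosines of 0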
def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
# calculate attention
matmul_qk = tf.matmul(q, k, transpose_b=True)
......@@ -57,7 +57,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
if mask is not None:
scaled_attention_logits += (mask * -1e4)
scaled_attention_logits += mask * -1e4
if attention_mask is not None:
# Apply the attention mask
......@@ -83,11 +83,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self.depth = int(d_model_size / self.num_heads)
self.Wq = tf.keras.layers.Dense(d_model_size, name='Wq')
self.Wk = tf.keras.layers.Dense(d_model_size, name='Wk')
self.Wv = tf.keras.layers.Dense(d_model_size, name='Wv')
self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq")
self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk")
self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv")
self.dense = tf.keras.layers.Dense(d_model_size, name='dense')
self.dense = tf.keras.layers.Dense(d_model_size, name="dense")
def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
......@@ -122,22 +122,22 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
return outputs
def point_wise_feed_forward_network(d_model_size, dff, name=""):
return tf.keras.Sequential([
tf.keras.layers.Dense(dff, activation='relu', name="0"),
tf.keras.layers.Dense(d_model_size, name="2")
], name="ffn")
return tf.keras.Sequential(
[tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
name="ffn",
)
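The helper above is just a two-layer position-wise MLP: the same dense stack is applied to every time step independently, so the sequence shape is preserved. A shape check, reusing the function defined above:

import tensorflow as tf

ffn = point_wise_feed_forward_network(d_model_size=16, dff=64)  # defined above
x = tf.random.normal((2, 5, 16))  # (batch, seq, d_model)
y = ffn(x)
assert y.shape == (2, 5, 16)      # per-position transform preserves shape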
class TFEncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs):
def __init__(
self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
):
super(TFEncoderLayer, self).__init__(**kwargs)
self.multi_head_attention = TFMultiHeadAttention(d_model_size,
num_heads,
output_attentions,
name="multi_head_attention")
self.multi_head_attention = TFMultiHeadAttention(
d_model_size, num_heads, output_attentions, name="multi_head_attention"
)
self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
......@@ -149,8 +149,9 @@ class TFEncoderLayer(tf.keras.layers.Layer):
def call(self, inputs, training=False):
x, mask, layer_past, attention_mask, head_mask = inputs
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention([normed, normed, normed, mask, layer_past,
attention_mask, head_mask], training=training)
attn_outputs = self.multi_head_attention(
[normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training
)
attn_output = attn_outputs[0]
attn_output = self.dropout1(attn_output, training=training)
out1 = x + attn_output
......@@ -176,20 +177,23 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
self.w = TFSharedEmbeddings(config.vocab_size,
config.n_embd,
initializer_range=config.initializer_range,
name="w")
self.w = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w"
)
self.dropout = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFEncoderLayer(config.n_embd,
self.h = [
TFEncoderLayer(
config.n_embd,
config.n_head,
config.dff,
config.resid_pdrop,
config.layer_norm_epsilon,
config.output_attentions,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
name="h_._{}".format(i),
)
for i in range(config.n_layer)
]
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
def get_input_embeddings(self):
......@@ -204,7 +208,17 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -215,13 +229,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
......@@ -276,14 +290,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.w(token_type_ids, mode='embedding')
token_type_embeds = self.w(token_type_ids, mode="embedding")
token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32))
else:
token_type_embeds = 0
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.w(input_ids, mode='embedding')
inputs_embeds = self.w(input_ids, mode="embedding")
seq_len = input_shape[-1]
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
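The expression above builds the causal mask: `tf.linalg.band_part(ones, -1, 0)` keeps the lower triangle (including the diagonal), and the `1 -` flips it so that forbidden future positions are marked with 1. For `seq_len = 3`:

import tensorflow as tf

seq_len = 3
mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
# mask.numpy() ==
# [[0., 1., 1.],
#  [0., 0., 1.],
#  [0., 0., 0.]]   -> 1 marks a future position to be suppressed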
......@@ -333,6 +347,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = CTRLConfig
pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -392,8 +407,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class TFCTRLModel(TFCTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -423,9 +442,10 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name='transformer')
self.transformer = TFCTRLMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
......@@ -442,10 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFCTRLLMHead, self).build(input_shape)
def call(self, hidden_states):
......@@ -454,8 +471,12 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
return hidden_states
@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The CTRL Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
CTRL_INPUTS_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -486,9 +507,10 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
loss, logits = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name='transformer')
self.transformer = TFCTRLMainLayer(config, name="transformer")
self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
......
......@@ -16,33 +16,28 @@
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import copy
import sys
from io import open
import itertools
import numpy as np
import tensorflow as tf
from .configuration_distilbert import DistilBertConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list
logger = logging.getLogger(__name__)
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
"distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
"distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
}
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
......@@ -53,6 +48,7 @@ def gelu(x):
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
def gelu_new(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
......@@ -62,24 +58,25 @@ def gelu_new(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
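A quick check of the tanh approximation in `gelu_new` against the exact erf-based `gelu` defined just above: the two formulas agree to a few decimal places for moderate inputs (this sketch recomputes both directly):

import numpy as np
import tensorflow as tf

x = tf.constant([-1.0, 0.0, 1.0])
exact = x * 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
approx = x * 0.5 * (1.0 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
assert np.allclose(exact.numpy(), approx.numpy(), atol=1e-3)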
class TFEmbeddings(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFEmbeddings, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dim = config.dim
self.initializer_range = config.initializer_range
self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
config.dim,
initializer_range=config.initializer_range,
name='word_embeddings') # padding_idx=0)
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.word_embeddings = TFSharedEmbeddings(
config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings"
) # padding_idx=0)
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.dim,
embeddings_initializer=get_initializer(config.initializer_range),
name='position_embeddings')
name="position_embeddings",
)
if config.sinusoidal_pos_embds:
raise NotImplementedError
......@@ -92,9 +89,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.dim],
initializer=get_initializer(self.initializer_range))
"weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
)
super(TFEmbeddings, self).build(input_shape)
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
......@@ -181,18 +177,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
assert self.dim % self.n_heads == 0
self.q_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="q_lin")
self.k_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="k_lin")
self.v_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="v_lin")
self.out_lin = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="out_lin")
self.q_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
)
self.k_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin"
)
self.v_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin"
)
self.out_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin"
)
self.pruned_heads = set()
......@@ -259,18 +255,23 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
else:
return (context,)
class TFFFN(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFFFN, self).__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
kernel_initializer=get_initializer(config.initializer_range),
name="lin1")
self.lin2 = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="lin2")
assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
self.lin1 = tf.keras.layers.Dense(
config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
)
self.lin2 = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2"
)
assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format(
config.activation
)
self.activation = (
tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu
)
def call(self, input, training=False):
x = self.lin1(input)
......@@ -341,8 +342,7 @@ class TFTransformer(tf.keras.layers.Layer):
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i))
for i in range(config.n_layers)]
self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
def call(self, inputs, training=False):
"""
......@@ -421,10 +421,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 4, "Too many inputs."
else:
input_ids = inputs
......@@ -458,11 +458,12 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "distilbert"
......@@ -534,8 +535,12 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -561,6 +566,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
......@@ -580,10 +586,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFDistilBertLMHead, self).build(input_shape)
def call(self, hidden_states):
......@@ -592,8 +595,11 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
return hidden_states
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -619,6 +625,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.output_attentions = config.output_attentions
......@@ -626,9 +633,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
self.vocab_size = config.vocab_size
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.vocab_transform = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="vocab_transform")
self.vocab_transform = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform"
)
self.act = tf.keras.layers.Activation(gelu)
self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
......@@ -649,9 +656,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -677,18 +687,21 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.pre_classifier = tf.keras.layers.Dense(config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation='relu',
name="pre_classifier")
self.classifier = tf.keras.layers.Dense(config.num_labels,
self.pre_classifier = tf.keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier")
activation="relu",
name="pre_classifier",
)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
def call(self, inputs, **kwargs):
......@@ -697,16 +710,19 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim)
logits = self.classifier(pooled_output) # (bs, dim)
outputs = (logits,) + distilbert_output[1:]
return outputs # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
@add_start_docstrings(
"""DistilBert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -728,22 +744,23 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
outputs = model(input_ids)
scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='classifier')
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(self, inputs, **kwargs):
outputs = self.distilbert(inputs, **kwargs)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
......@@ -751,9 +768,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
return outputs # scores, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
@add_start_docstrings(
"""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
DISTILBERT_START_DOCSTRING,
DISTILBERT_INPUTS_DOCSTRING,
)
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -781,13 +801,14 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
assert config.num_labels == 2
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
......@@ -795,7 +816,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
distilbert_output = self.distilbert(inputs, **kwargs)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False)) # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim)
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
......
......@@ -17,28 +17,31 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
get_initializer,
shape_list,
)
logger = logging.getLogger(__name__)
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",
}
def gelu(x):
......@@ -50,8 +53,7 @@ def gelu(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
......@@ -68,8 +70,8 @@ class TFAttention(tf.keras.layers.Layer):
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
......@@ -82,7 +84,7 @@ class TFAttention(tf.keras.layers.Layer):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
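A worked case of the mask above for nd=2, ns=3 (two new query positions attending over three total key positions, e.g. one cached key): row i may attend to key j whenever i >= j - ns + nd, which yields 1's in the lower triangle counted from the lower right corner.

import tensorflow as tf

def causal_attention_mask(nd, ns, dtype):
    # Same logic as the staticmethod above, repeated here to be self-contained.
    i = tf.range(nd)[:, None]
    j = tf.range(ns)
    return tf.cast(i >= j - ns + nd, dtype)

# Expected output:
# [[1., 1., 0.],
#  [1., 1., 1.]]
print(causal_attention_mask(2, 3, tf.float32).numpy())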
......@@ -158,8 +160,8 @@ class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
......@@ -174,10 +176,10 @@ class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
self.mlp = TFMLP(4 * nx, config, name='mlp')
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.mlp = TFMLP(4 * nx, config, name="mlp")
def call(self, inputs, training=False):
x, layer_past, attention_mask, head_mask = inputs
......@@ -204,20 +206,18 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.wte = TFSharedEmbeddings(config.vocab_size,
config.hidden_size,
initializer_range=config.initializer_range,
name='wte')
self.wpe = tf.keras.layers.Embedding(config.n_positions,
self.wte = TFSharedEmbeddings(
config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
)
self.wpe = tf.keras.layers.Embedding(
config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='wpe')
name="wpe",
)
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
def get_input_embeddings(self):
return self.wte
......@@ -231,7 +231,17 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -242,13 +252,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
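The `past` entry parsed here is what enables incremental decoding: the presents returned by one forward pass are fed back so only the newest token is re-encoded. A hedged sketch of that loop with a tiny randomly initialized model; the `GPT2Config` kwargs are illustrative and may differ across library versions:

import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel

# Tiny, randomly initialized model; config kwargs are assumptions, not canonical.
config = GPT2Config(n_layer=2, n_head=2, n_embd=32, n_positions=16, n_ctx=16)
model = TFGPT2LMHeadModel(config)

input_ids = tf.constant([[1, 2, 3]])
logits, past = model(input_ids)[:2]              # past caches per-layer keys/values

next_token = tf.constant([[4]])
logits, past = model(next_token, past=past)[:2]  # only the new token is encoded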
......@@ -295,7 +305,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -304,11 +314,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids, mode='embedding')
inputs_embeds = self.wte(input_ids, mode="embedding")
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.wte(token_type_ids, mode='embedding')
token_type_embeds = self.wte(token_type_ids, mode="embedding")
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
......@@ -353,6 +363,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -428,8 +439,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -459,17 +474,22 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.transformer = TFGPT2MainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -500,9 +520,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.transformer = TFGPT2MainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.transformer.wte
......@@ -518,11 +539,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return outputs # lm_logits, presents, (all hidden_states), (attentions)
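The elided `call` body produces the LM logits by reusing the tied embedding matrix in linear mode, consistent with `get_output_embeddings` above. A rough sketch, not the verbatim diff:

hidden_states = transformer_outputs[0]                           # (batch, seq, n_embd)
lm_logits = self.transformer.wte(hidden_states, mode="linear")   # (batch, seq, vocab_size)
outputs = (lm_logits,) + transformer_outputs[1:]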
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
""",
GPT2_START_DOCSTRING,
GPT2_INPUTS_DOCSTRING,
)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
......@@ -572,16 +597,30 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
self.transformer = TFGPT2MainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
def get_output_embeddings(self):
return self.transformer.wte
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
......@@ -593,14 +632,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
input_ids = inputs.get("input_ids")
past = inputs.get("past", past)
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
......@@ -617,7 +656,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
past,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
......
......@@ -17,25 +17,28 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
get_initializer,
shape_list,
)
logger = logging.getLogger(__name__)
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"
}
def gelu(x):
......@@ -47,8 +50,7 @@ def gelu(x):
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
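This is the tanh approximation to the exact GELU x * Φ(x). A quick numeric sanity check against the erf form, assuming eager TF2 execution:

import numpy as np
import tensorflow as tf

x = tf.constant(np.linspace(-3.0, 3.0, 13), dtype=tf.float32)
exact = x * 0.5 * (1.0 + tf.math.erf(x / np.sqrt(2.0)))
print(np.max(np.abs(gelu(x).numpy() - exact.numpy())))  # small, on the order of 1e-3 or less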
......@@ -56,9 +58,11 @@ def swish(x):
return x * tf.math.sigmoid(x)
ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
ACT_FNS = {
"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish)}
"swish": tf.keras.layers.Activation(swish),
}
class TFAttention(tf.keras.layers.Layer):
......@@ -74,8 +78,8 @@ class TFAttention(tf.keras.layers.Layer):
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
......@@ -88,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
i = tf.range(nd)[:, None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
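Concretely, for nd == ns this is a plain lower triangle, and for nd < ns (a short query block attending over cached keys) the band shifts right so each new position still sees every earlier key. Assuming the function is exposed as a static method on TFAttention, as in this file:

import tensorflow as tf

print(TFAttention.causal_attention_mask(3, 3, tf.int32).numpy())
# [[1 0 0]
#  [1 1 0]
#  [1 1 1]]
print(TFAttention.causal_attention_mask(1, 4, tf.int32).numpy())
# [[1 1 1 1]] -- one new query position attends to all four keys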
......@@ -159,8 +163,8 @@ class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
......@@ -175,10 +179,10 @@ class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.mlp = TFMLP(4 * nx, config, name='mlp')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
def call(self, inputs, training=False):
x, attention_mask, head_mask = inputs
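The rest of the block body is elided by the diff; it follows the original GPT's post-layer-norm residual layout. A hedged sketch of the remainder, consistent with the sublayers built in __init__ above but not the verbatim source:

output_attn = self.attn([x, attention_mask, head_mask], training=training)
a = output_attn[0]
n = self.ln_1(x + a)             # residual + LayerNorm after attention
m = self.mlp(n, training=training)
h = self.ln_2(n + m)             # residual + LayerNorm after the MLP
outputs = [h] + output_attn[1:]  # hidden states plus optional attentions
return outputs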
......@@ -203,19 +207,17 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
config.n_embd,
initializer_range=config.initializer_range,
name='tokens_embed')
self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
self.tokens_embed = TFSharedEmbeddings(
config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed"
)
self.positions_embed = tf.keras.layers.Embedding(
config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='positions_embed')
name="positions_embed",
)
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)]
def get_input_embeddings(self):
return self.tokens_embed
......@@ -229,7 +231,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -239,12 +250,12 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
......@@ -286,7 +297,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if not head_mask is None:
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
......@@ -295,11 +306,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
if inputs_embeds is None:
inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
position_embeds = self.positions_embed(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
......@@ -338,6 +349,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
......@@ -409,8 +421,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -436,17 +452,22 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
......@@ -472,9 +493,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
def get_output_embeddings(self):
return self.transformer.tokens_embed
......@@ -490,11 +512,15 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return outputs # lm_logits, (all hidden_states), (attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
@add_start_docstrings(
"""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
""",
OPENAI_GPT_START_DOCSTRING,
OPENAI_GPT_INPUTS_DOCSTRING,
)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
......@@ -536,16 +562,29 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
config.num_labels = 1
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
self.multiple_choice_head = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="multiple_choice_head"
)
def get_output_embeddings(self):
return self.transformer.tokens_embed
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
mc_token_ids=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
......@@ -556,13 +595,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
......@@ -579,7 +618,14 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds]
flat_inputs = [
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
......