"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "818997788584b9fc043d8b58e078f63aadb6b60e"
Unverified commit 228cdd6a, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into conditional-generation

parents 3cf2020c 079bfb32
@@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
             example.text_b,
             add_special_tokens=True,
             max_length=max_length,
-            truncate_first_sequence=True  # We're truncating the first sequence in priority
         )
         input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
...
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 try:
     import tensorflow as tf
-    assert int(tf.__version__[0]) >= 2
+    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
     _tf_available = True  # pylint: disable=invalid-name
     logger.info("TensorFlow version {} available.".format(tf.__version__))
 except (ImportError, AssertionError):
@@ -246,7 +246,7 @@ def http_get(url, temp_file, proxies=None):
     progress.close()

-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -266,12 +266,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
         etag = s3_etag(url, proxies=proxies)
     else:
         try:
-            response = requests.head(url, allow_redirects=True, proxies=proxies)
+            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
            if response.status_code != 200:
                 etag = None
             else:
                 etag = response.headers.get("ETag")
-        except EnvironmentError:
+        except (EnvironmentError, requests.exceptions.Timeout):
            etag = None

    if sys.version_info[0] == 2 and etag is not None:
...
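Note: a small, self-contained sketch of the pattern this change enables (not the library function itself): a HEAD request that gives up after `etag_timeout` seconds instead of hanging the cache lookup, with the timeout treated the same as any other connection failure.

```python
import requests

def fetch_etag(url, proxies=None, etag_timeout=10):
    """Return the ETag header for `url`, or None if the server is slow or unreachable."""
    try:
        # A hanging HEAD request now fails fast instead of blocking the cache lookup.
        response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
        if response.status_code != 200:
            return None
        return response.headers.get("ETag")
    except (EnvironmentError, requests.exceptions.Timeout):
        return None
```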
@@ -21,6 +21,7 @@ import logging
 from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
 from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
 from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
 from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
 from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
 from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
@@ -51,6 +52,7 @@ class AutoModel(object):
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
             - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
             - contains `xlnet`: XLNetModel (XLNet model)
             - contains `xlm`: XLMModel (XLM model)
@@ -73,6 +75,7 @@ class AutoModel(object):
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
             - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
             - contains `xlnet`: XLNetModel (XLNet model)
             - contains `xlm`: XLMModel (XLM model)
@@ -149,10 +152,11 @@ class AutoModel(object):
             return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))

 class AutoModelWithLMHead(object):
@@ -172,6 +176,7 @@ class AutoModelWithLMHead(object):
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
             - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
             - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: XLNetLMHeadModel (XLNet model)
             - contains `xlm`: XLMWithLMHeadModel (XLM model)
@@ -273,10 +278,11 @@ class AutoModelWithLMHead(object):
             return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))

 class AutoModelForSequenceClassification(object):
...
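Note: with the dispatch above, any identifier containing `ctrl` now resolves to the CTRL classes. A hedged usage sketch, not part of this commit; the `'ctrl'` checkpoint name and the token ids below are illustrative assumptions, and the checkpoint itself is large.

```python
import torch
from transformers import AutoModel, AutoModelWithLMHead

# Dispatch is purely substring-based: 'ctrl' in the identifier selects the CTRL classes.
base_model = AutoModel.from_pretrained('ctrl')          # -> CTRLModel
lm_model = AutoModelWithLMHead.from_pretrained('ctrl')  # -> CTRLLMHeadModel

input_ids = torch.tensor([[11859, 586, 20984]])  # arbitrary token ids, batch size 1
last_hidden_state = base_model(input_ids)[0]
lm_logits = lm_model(input_ids)[0]
```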
@@ -46,6 +46,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
+    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
+    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
 }
@@ -1194,12 +1196,16 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     Examples::

         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
+        input_ids = tokenizer.encode(input_text)
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
+        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
+        # a nice puppet
     """
     def __init__(self, config):
...
This diff is collapsed.
@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):
         dim_per_head = self.dim // self.n_heads

-        assert 2 <= mask.dim() <= 3
-        causal = (mask.dim() == 3)
         mask_reshp = (bs, 1, 1, k_length)

         def shape(x):
...
@@ -347,6 +347,7 @@ class GPT2Model(GPT2PreTrainedModel):
         super(GPT2Model, self).__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
+        self.output_past = config.output_past

         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
@@ -440,7 +441,8 @@ class GPT2Model(GPT2PreTrainedModel):
                             head_mask=head_mask[i])

             hidden_states, present = outputs[:2]
-            presents = presents + (present,)
+            if self.output_past:
+                presents = presents + (present,)

             if self.output_attentions:
                 all_attentions.append(outputs[2])
@@ -452,7 +454,9 @@ class GPT2Model(GPT2PreTrainedModel):
         if self.output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states, presents)
+        outputs = (hidden_states,)
+        if self.output_past:
+            outputs = outputs + (presents,)
         if self.output_hidden_states:
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
@@ -460,7 +464,7 @@ class GPT2Model(GPT2PreTrainedModel):
             attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
             all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
             outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+        return outputs  # last hidden state, (presents), (all hidden_states), (attentions)

 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
...
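Note: a minimal sketch of what the new `output_past` flag controls, assuming the companion configuration change lets `GPT2Config` accept and store `output_past` (as `self.output_past = config.output_past` above implies). With the flag off, the cached key/value `presents` are simply dropped from the output tuple.

```python
import torch
from transformers import GPT2Config, GPT2Model

# Assumption: the config accepts/stores `output_past`, per the constructor change above.
config = GPT2Config(output_past=False)
model = GPT2Model(config)  # randomly initialised, no download needed

input_ids = torch.tensor([[464, 3290, 318, 13779]])  # arbitrary token ids
outputs = model(input_ids)

# With output_past=False the tuple is just (last_hidden_state,) plus any optional extras;
# with the default output_past=True it would also contain `presents` in second position.
print(len(outputs))
```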
@@ -170,7 +170,7 @@ class Attention(nn.Module):
         # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
-        w = w * b + -1e9 * (1 - b)
+        w = w * b + - 1e4 * (1 - b)

         if attention_mask is not None:
             # Apply the attention mask
...
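Note: lowering the masking constant from -1e9 to -1e4 keeps it representable in float16 (whose maximum magnitude is about 65504) while still driving masked positions to near-zero probability after the softmax. A standalone illustration, separate from the model code:

```python
import torch

scores = torch.tensor([[2.0, 1.0, 0.5]])
mask = torch.tensor([[1.0, 1.0, 0.0]])  # 1 = keep, 0 = mask out

# Additive masking as in the diff: masked positions get a large negative value.
masked = scores * mask + -1e4 * (1 - mask)
probs = torch.softmax(masked, dim=-1)
print(probs)  # the masked position receives (near-)zero probability

# -1e9 is not representable in float16 and overflows to -inf, which is why
# the constant was lowered to -1e4.
print(torch.tensor(-1e9, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)
```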
@@ -34,6 +34,7 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
 }

 class RobertaEmbeddings(BertEmbeddings):
@@ -172,7 +173,8 @@ class RobertaModel(BertModel):
         if input_ids[:, 0].sum().item() != 0:
             logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
-                           "Please specify add_special_tokens=True in your encoding.")
+                           "Please specify add_special_tokens=True in your tokenize.encode()"
+                           "or tokenizer.convert_tokens_to_ids().")
         return super(RobertaModel, self).forward(input_ids,
                                                  attention_mask=attention_mask,
                                                  token_type_ids=token_type_ids,
@@ -341,6 +343,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)

 @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
                          the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
                       ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@@ -449,6 +452,81 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
+                         the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForTokenClassification(BertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), scores, (hidden_states), (attentions)
+

 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
...
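Note: the new RobertaForTokenClassification only counts positions where `attention_mask == 1` towards the loss. A standalone sketch of that masking pattern with made-up shapes, separate from the class itself:

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 0 marks padding

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1            # keep only real tokens
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)          # padding never contributes
print(loss.item())
```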
@@ -26,6 +26,7 @@ from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSeque
 from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel

 from .file_utils import add_start_docstrings
@@ -52,6 +53,7 @@ class TFAutoModel(object):
             - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetModel (XLNet model)
             - contains `xlm`: TFXLMModel (XLM model)
+            - contains `ctrl`: TFCTRLModel (CTRL model)

         This class cannot be instantiated using `__init__()` (throws an error).
     """
@@ -73,7 +75,7 @@ class TFAutoModel(object):
             - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
             - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetModel (XLNet model)
-            - contains `xlm`: TFXLMModel (XLM model)
+            - contains `ctrl`: TFCTRLModel (CTRL model)

         Params:
             pretrained_model_name_or_path: either:
@@ -147,10 +149,12 @@ class TFAutoModel(object):
             return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))

 class TFAutoModelWithLMHead(object):
@@ -173,6 +177,7 @@ class TFAutoModelWithLMHead(object):
             - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
             - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)

         This class cannot be instantiated using `__init__()` (throws an error).
     """
@@ -198,6 +203,7 @@ class TFAutoModelWithLMHead(object):
             - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
             - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
             - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: TFCTRLLMHeadModel (CTRL model)

         Params:
             pretrained_model_name_or_path: either:
@@ -271,10 +277,12 @@ class TFAutoModelWithLMHead(object):
             return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))

 class TFAutoModelForSequenceClassification(object):
...
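Note: the TF2 auto classes gain the same substring dispatch. A brief hedged sketch; whether a TF-format `'ctrl'` checkpoint is published is an assumption here, and the token ids are arbitrary.

```python
import tensorflow as tf
from transformers import TFAutoModelWithLMHead

# 'ctrl' in the identifier routes to TFCTRLLMHeadModel (assumes a TF checkpoint exists under this name).
model = TFAutoModelWithLMHead.from_pretrained('ctrl')

input_ids = tf.constant([[11859, 586, 20984]])  # arbitrary token ids, batch size 1
lm_logits = model(input_ids)[0]
```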
@@ -30,7 +30,6 @@ import tensorflow as tf
 from .configuration_bert import BertConfig
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -52,14 +51,6 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }

-def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """ Gaussian Error Linear Unit.
     Original Implementation of the gelu activation function in Google Bert repo when initially created.
@@ -545,7 +536,6 @@ class TFBertPreTrainedModel(TFPreTrainedModel):
     """
     config_class = BertConfig
     pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_bert_pt_weights_in_tf2
     base_model_prefix = "bert"
...
This diff is collapsed.
@@ -31,7 +31,6 @@ import tensorflow as tf
 from .configuration_distilbert import DistilBertConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -66,14 +65,6 @@ def gelu_new(x):
                         (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf

-def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    tf_inputs = [inputs_list, attns_list]
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFEmbeddings(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFEmbeddings, self).__init__(**kwargs)
@@ -226,8 +217,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
         dim_per_head = self.dim // self.n_heads

-        assert 2 <= len(tf.shape(mask)) <= 3
-        causal = (len(tf.shape(mask)) == 3)
         mask_reshape = [bs, 1, 1, k_length]

         def shape(x):
@@ -456,7 +445,6 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel):
     """
     config_class = DistilBertConfig
     pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_distilbert_pt_weights_in_tf2
     base_model_prefix = "distilbert"
...
@@ -32,7 +32,6 @@ from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                 TFSequenceSummary, shape_list, get_initializer)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -42,14 +41,6 @@ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models
                                         "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}

-def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -350,7 +341,6 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
     """
     config_class = GPT2Config
     pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_gpt2_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -32,21 +32,12 @@ from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                 TFSequenceSummary, shape_list, get_initializer)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)

 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}

-def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def gelu(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
@@ -335,7 +326,6 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
     """
     config_class = OpenAIGPTConfig
     pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -25,8 +25,6 @@ import numpy
 logger = logging.getLogger(__name__)

-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-

 def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
     """ Convert a TF 2.0 model variable name in a pytorch model weight name.
@@ -105,7 +103,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         raise e

     if tf_inputs is None:
-        tf_inputs = tf.constant(DUMMY_INPUTS)
+        tf_inputs = tf_model.dummy_inputs

     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
@@ -200,7 +198,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
         tf_model = tf_model_class(pt_model.config)

     if tf_inputs is None:
-        tf_inputs = tf.constant(DUMMY_INPUTS)
+        tf_inputs = tf_model.dummy_inputs

     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure model is built
...
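Note: with the module-level DUMMY_INPUTS gone from this file, the conversion helpers now ask the model itself for build inputs, which is what removes the per-architecture load_*_pt_weights_in_tf2 helpers elsewhere in this commit. A hedged sketch of the calling pattern; the checkpoint path is a placeholder, not a real file.

```python
from transformers import BertConfig, TFBertModel
from transformers.modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

config = BertConfig()
tf_model = TFBertModel(config)

# Each TF model now carries its own dummy_inputs, so the Keras graph can be
# built before weights are copied over from a PyTorch checkpoint.
_ = tf_model(tf_model.dummy_inputs, training=False)   # builds the network
tf_model = load_pytorch_checkpoint_in_tf2_model(
    tf_model, "path/to/pytorch_model.bin")             # placeholder path
```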
@@ -26,7 +26,6 @@ import tensorflow as tf
 from .configuration_roberta import RobertaConfig
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
@@ -36,16 +35,9 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
 }

-def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFRobertaEmbeddings(TFBertEmbeddings):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
@@ -83,7 +75,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
             input_ids = inputs

         if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0):
-            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
+            tf.print("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
                            "Please specify add_special_tokens=True in your encoding.")
@@ -96,7 +88,6 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel):
     """
     config_class = RobertaConfig
     pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_roberta_pt_weights_in_tf2
     base_model_prefix = "roberta"
@@ -380,3 +371,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
         outputs = (logits,) + outputs[2:]

         return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of
+                         the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFRobertaMainLayer(config, name='roberta')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.roberta(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
@@ -33,7 +33,6 @@ from .configuration_transfo_xl import TransfoXLConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -41,14 +40,6 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
 }

-def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-    tf_inputs = tf.constant(inputs_list)
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 class TFPositionalEmbedding(tf.keras.layers.Layer):
     def __init__(self, demb, **kwargs):
         super(TFPositionalEmbedding, self).__init__(**kwargs)
@@ -577,7 +568,6 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
     """
     config_class = TransfoXLConfig
     pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_transfo_xl_pt_weights_in_tf2
     base_model_prefix = "transformer"
...
@@ -25,9 +25,11 @@ import tensorflow as tf
 from .configuration_utils import PretrainedConfig
 from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)

+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+

 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
@@ -48,8 +50,8 @@ class TFPreTrainedModel(tf.keras.Model):
     """
     config_class = None
     pretrained_model_archive_map = {}
-    load_pt_weights = lambda model, config, path: None
     base_model_prefix = ""
+    dummy_inputs = tf.constant(DUMMY_INPUTS)  # dummy inputs to build the network

     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -262,17 +264,16 @@ class TFPreTrainedModel(tf.keras.Model):
         if from_pt:
             # Load from a PyTorch checkpoint
-            return cls.load_pt_weights(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)

-        inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-        ret = model(inputs, training=False)  # build the network with dummy inputs
+        ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs

         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
         # 'by_name' allow us to do transfer learning by skipping/adding layers
         # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
         model.load_weights(resolved_archive_file, by_name=True)

-        ret = model(inputs, training=False)  # Make sure restore ops are run
+        ret = model(model.dummy_inputs, training=False)  # Make sure restore ops are run

         return model
@@ -393,26 +394,26 @@ class TFSequenceSummary(tf.keras.layers.Layer):
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
             raise NotImplementedError

-        self.summary = None
-        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+        self.has_summary = hasattr(config, 'summary_use_proj') and config.summary_use_proj
+        if self.has_summary:
             if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = tf.keras.layers.Dense(num_classes,
                                                  kernel_initializer=get_initializer(initializer_range),
                                                  name='summary')

-        self.activation = None
-        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+        self.has_activation = hasattr(config, 'summary_activation') and config.summary_activation == 'tanh'
+        if self.has_activation:
             self.activation = tf.keras.activations.tanh

-        self.first_dropout = None
-        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+        self.has_first_dropout = hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0
+        if self.has_first_dropout:
             self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

-        self.last_dropout = None
-        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+        self.has_last_dropout = hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0
+        if self.has_last_dropout:
             self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)

     def call(self, inputs, training=False):
@@ -455,17 +456,17 @@ class TFSequenceSummary(tf.keras.layers.Layer):
         elif self.summary_type == 'attn':
             raise NotImplementedError

-        if training and self.first_dropout is not None:
-            output = self.first_dropout(output)
+        if self.has_first_dropout:
+            output = self.first_dropout(output, training=training)

-        if self.summary is not None:
+        if self.has_summary:
             output = self.summary(output)

-        if self.activation is not None:
+        if self.has_activation:
             output = self.activation(output)

-        if training and self.last_dropout is not None:
-            output = self.last_dropout(output)
+        if self.has_last_dropout:
+            output = self.last_dropout(output, training=training)

         return output
...
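Note: the TFSequenceSummary refactor relies on `tf.keras.layers.Dropout` being an identity when called with `training=False`, which is why the old `if training and ...` guards can go away. A tiny standalone check, separate from the library code:

```python
import tensorflow as tf

dropout = tf.keras.layers.Dropout(0.5)
x = tf.ones((1, 4))

print(dropout(x, training=False))  # identity: [[1. 1. 1. 1.]]
print(dropout(x, training=True))   # some entries zeroed, the rest scaled by 1/(1-0.5)
```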
@@ -25,9 +25,8 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlm import XLMConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer, DUMMY_INPUTS
 from .file_utils import add_start_docstrings
-from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

 logger = logging.getLogger(__name__)
@@ -45,19 +44,6 @@ TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
 }

-def load_xlm_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
-    # build the network
-    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    if tf_model.config.use_lang_emb and tf_model.config.n_langs > 1:
-        langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
-    else:
-        langs_list = None
-    tf_inputs = [inputs_list, attns_list, langs_list]
-    tfo = tf_model(tf_inputs, training=False)
-    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
-
-
 def create_sinusoidal_embeddings(n_pos, dim, out):
     position_enc = np.array([
         [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
@@ -441,9 +427,19 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
     """
     config_class = XLMConfig
     pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_pt_weights = load_xlm_pt_weights_in_tf2
     base_model_prefix = "transformer"

+    @property
+    def dummy_inputs(self):
+        # Sometimes XLM has language embeddings so don't forget to build them as well if needed
+        inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return [inputs_list, attns_list, langs_list]
+

 XLM_START_DOCSTRING = r"""    The XLM model was proposed in
     `Cross-lingual Language Model Pretraining`_
...