Unverified commit a52d56c8 authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into cleanup-configs

parents 8ade2040 e92bcb7e
...@@ -131,8 +131,9 @@ class HfApi:
# the client still has to specify it when uploading the file.
with open(filepath, "rb") as f:
pf = TqdmProgressFileReader(f)
data = f if pf.total_size > 0 else ""
r = requests.put(urls.write, data=data, headers={
"content-type": urls.type,
})
r.raise_for_status()
...
...@@ -29,6 +29,7 @@ from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequen
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
from .modeling_t5 import T5Model, T5WithLMHeadModel
from .modeling_utils import PreTrainedModel, SequenceSummary
...@@ -49,6 +50,7 @@ class AutoModel(object):
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Model (T5 model)
- contains `distilbert`: DistilBertModel (DistilBERT model)
- contains `albert`: AlbertModel (ALBERT model)
- contains `camembert`: CamembertModel (CamemBERT model)
...@@ -74,6 +76,7 @@ class AutoModel(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5Model (T5 model)
- contains `distilbert`: DistilBertModel (DistilBERT model)
- contains `albert`: AlbertModel (ALBERT model)
- contains `camembert`: CamembertModel (CamemBERT model)
...@@ -146,7 +149,9 @@ class AutoModel(object):
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
...@@ -185,6 +190,7 @@ class AutoModelWithLMHead(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5WithLMHeadModel (T5 model)
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
- contains `albert`: AlbertForMaskedLM (ALBERT model)
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
...@@ -213,6 +219,7 @@ class AutoModelWithLMHead(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: T5WithLMHeadModel (T5 model)
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
- contains `albert`: AlbertForMaskedLM (ALBERT model)
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
...@@ -284,7 +291,9 @@ class AutoModelWithLMHead(object):
model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
...
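# Usage sketch (illustrative, not part of this commit): with the dispatch above, any
# model name or path containing "t5" now resolves to the new T5 classes. This assumes
# the 't5-small' shortcut weights exist and that T5Model / T5WithLMHeadModel are
# exported from the top-level transformers package.
def _example_auto_dispatch():
    from transformers import AutoModel, AutoModelWithLMHead, T5Model, T5WithLMHeadModel
    base = AutoModel.from_pretrained('t5-small')            # -> T5Model
    lm = AutoModelWithLMHead.from_pretrained('t5-small')    # -> T5WithLMHeadModel
    assert isinstance(base, T5Model) and isinstance(lm, T5WithLMHeadModel)
    return base, lm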
...@@ -219,9 +219,7 @@ class PreTrainedEncoderDecoder(nn.Module):
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
if encoder_hidden_states is None:
encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
encoder_hidden_states = encoder_outputs[0]
else:
encoder_outputs = ()
...
# coding=utf-8
# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch T5 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
import copy
import itertools
from io import open
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from .modeling_utils import PreTrainedModel, prune_linear_layer
from .configuration_t5 import T5Config
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
logger = logging.getLogger(__name__)
####################################################
# This dict contains shortcut names and associated URLs
# for the pretrained weights provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
}
####################################################
# This is a conversion method from TF 1.0 to PyTorch
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
####################################################
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
""" Load tf checkpoints in a pytorch model.
"""
try:
import re
import numpy as np
import tensorflow as tf
except ImportError:
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions.")
raise
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
tf_weights = {}
for name, shape in init_vars:
logger.info("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
tf_weights[name] = array
for txt_name in names:
name = txt_name.split('/')
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required for using the pretrained model
if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
if '_slot_' in name[-1]:
logger.info("Skipping {}".format("/".join(name)))
tf_weights.pop(txt_name, None)
continue
pointer = model
array = tf_weights[txt_name]
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
# elif l[0] == 'scale':
# pointer = getattr(pointer, 'weight')
# elif l[0] == 'output_bias' or l[0] == 'beta':
# pointer = getattr(pointer, 'bias')
# elif l[0] == 'squad':
# pointer = getattr(pointer, 'classifier')
else:
try:
pointer = getattr(pointer, l[0])
except AttributeError:
logger.info("Skipping {}".format("/".join(name)))
continue
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if l[0] not in ['kernel', 'scale', 'embedding']:
pointer = getattr(pointer, 'weight')
if l[0] != 'embedding':
logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
array = np.transpose(array)
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
logger.info("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array.astype(np.float32))
tf_weights.pop(txt_name, None)
logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
# logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
return model
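# Conversion sketch (illustrative, not part of this commit): loading an original
# Mesh TensorFlow T5 checkpoint into the PyTorch model defined further down in this
# file. The checkpoint path is hypothetical; it assumes the 't5-small' shortcut
# configuration is available.
def _example_convert_tf_checkpoint(tf_checkpoint_path='/path/to/t5/model.ckpt'):
    config = T5Config.from_pretrained('t5-small')
    model = T5Model(config)   # resolved at call time, so the later class definition is fine
    # The weights of `model` are overwritten in place and the same instance is returned.
    return load_tf_weights_in_t5(model, config, tf_checkpoint_path)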
####################################################
# PyTorch Models are constructed by sub-classing
# - torch.nn.Module for the layers and
# - PreTrainedModel for the models (itself a subclass of torch.nn.Module)
####################################################
class T5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
""" Construct a layernorm module in the T5 style
No bias and no substraction of mean.
"""
super(T5LayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
variance = x.pow(2).mean(-1, keepdim=True)
x = x / torch.sqrt(variance + self.variance_epsilon)
return self.weight * x
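# Quick numerical check (illustrative, not part of this commit): T5LayerNorm is an
# RMS norm -- it rescales by the root mean square of the activations and applies a
# learned gain, with no mean subtraction and no bias.
def _example_t5_layer_norm():
    torch.manual_seed(0)
    x = torch.randn(2, 4, 8)                     # (batch, seq_len, hidden_size)
    ln = T5LayerNorm(hidden_size=8, eps=1e-6)
    manual = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
    assert torch.allclose(ln(x), manual)         # the gain is initialized to ones
    return ln(x)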
class T5DenseReluDense(nn.Module):
def __init__(self, config):
super(T5DenseReluDense, self).__init__()
self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
h = self.wi(hidden_states)
h = F.relu(h)
h = self.dropout(h)
h = self.wo(h)
return h
class T5LayerFF(nn.Module):
def __init__(self, config):
super(T5LayerFF, self).__init__()
self.DenseReluDense = T5DenseReluDense(config)
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states):
norm_x = self.layer_norm(hidden_states)
y = self.DenseReluDense(norm_x)
layer_output = hidden_states + self.dropout(y)
return layer_output
class T5Attention(nn.Module):
NEW_ID = itertools.count()
def __init__(self, config, has_relative_attention_bias=False):
super(T5Attention, self).__init__()
self.layer_id = next(T5Attention.NEW_ID)
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.output_attentions = config.output_attentions
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.d_model = config.d_model
self.d_kv = config.d_kv
self.n_heads = config.num_heads
self.dropout = config.dropout_rate
self.inner_dim = self.n_heads * self.d_kv
# Mesh TensorFlow initialization to avoid scaling before softmax
self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
if self.has_relative_attention_bias:
self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
mask = torch.ones(self.n_heads, self.d_kv)
heads = set(heads) - self.pruned_heads
for head in heads:
head -= sum(1 if h < head else 0 for h in self.pruned_heads)
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
# Prune linear layers
self.q = prune_linear_layer(self.q, index)
self.k = prune_linear_layer(self.k, index)
self.v = prune_linear_layer(self.v, index)
self.o = prune_linear_layer(self.o, index, dim=1)
# Update hyper params
self.n_heads = self.n_heads - len(heads)
self.inner_dim = self.d_kv * self.n_heads
self.pruned_heads = self.pruned_heads.union(heads)
@staticmethod
def _relative_position_bucket(relative_position,
bidirectional=True,
num_buckets=32,
max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention.
The relative position is defined as memory_position - query_position, i.e.
the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are
invalid.
We use smaller buckets for small absolute relative_position and larger buckets
for larger absolute relative_positions. All relative positions >=max_distance
map to the same bucket. All relative positions <=-max_distance map to the
same bucket. This should allow for more graceful generalization to longer
sequences than the model has been trained on.
Args:
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32
values in the range [0, num_buckets)
"""
ret = 0
n = -relative_position
if bidirectional:
num_buckets //= 2
ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
# now n is in the range [0, inf)
# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = (n < max_exact)
# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (
torch.log(n.float() / max_exact)
/ math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def compute_bias(self, qlen, klen):
""" Compute binned relative position bias """
context_position = torch.arange(qlen, dtype=torch.long)[:, None]
memory_position = torch.arange(klen, dtype=torch.long)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen)
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets)
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
return values
def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
# Input is (bs, qlen, dim)
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = input.size()
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
else:
klen = kv.size(1)
def shape(x):
""" projection """
return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)
def unshape(x):
""" compute context """
return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head)
if kv is None:
k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head)
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head)
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head)
v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
# q = q / math.sqrt(dim_per_head) # No scaling in T5
scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen)
if position_bias is None:
if not self.has_relative_attention_bias:
raise ValueError("No position_bias provided and no weights to compute position_bias")
position_bias = self.compute_bias(qlen, klen)
if mask is not None:
position_bias = position_bias + mask # (bs, n_heads, qlen, klen)
scores += position_bias
weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen)
weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen)
# Mask heads if we want to
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
context = unshape(context) # (bs, qlen, dim)
context = self.o(context)
outputs = (context,)
if self.output_attentions:
outputs = outputs + (weights,)
if self.has_relative_attention_bias:
outputs = outputs + (position_bias,)
return outputs
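# Illustrative sketches (not part of this commit). First, the bucketing scheme:
# nearby relative positions get their own buckets, distant ones share logarithmically
# spaced buckets, and anything beyond max_distance is clamped into the last bucket.
def _example_relative_buckets():
    relative_position = torch.arange(-10, 11).view(1, -1)   # memory_position - query_position
    return T5Attention._relative_position_bucket(
        relative_position, bidirectional=True, num_buckets=32, max_distance=128)

# Second, the shapes produced by a single self-attention layer that owns the relative
# attention bias. A SimpleNamespace stands in for T5Config and only carries the
# attributes this layer actually reads.
def _example_t5_attention_shapes():
    from types import SimpleNamespace
    cfg = SimpleNamespace(is_decoder=False, output_attentions=False,
                          relative_attention_num_buckets=32,
                          d_model=16, d_kv=4, num_heads=2, dropout_rate=0.0)
    attn = T5Attention(cfg, has_relative_attention_bias=True)
    x = torch.randn(3, 5, 16)                     # (bs, qlen, d_model)
    context, position_bias = attn(x)              # the bias is returned because this layer computed it
    assert context.shape == (3, 5, 16)
    assert position_bias.shape == (1, 2, 5, 5)    # (1, n_heads, qlen, klen)
    return context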
class T5LayerSelfAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super(T5LayerSelfAttention, self).__init__()
self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(norm_x,
mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
class T5LayerCrossAttention(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super(T5LayerCrossAttention, self).__init__()
self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
norm_x = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(norm_x,
mask=attention_mask,
kv=kv,
position_bias=position_bias,
head_mask=head_mask)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
class T5Block(nn.Module):
def __init__(self, config, has_relative_attention_bias=False):
super(T5Block, self).__init__()
self.is_decoder = config.is_decoder
self.layer = nn.ModuleList()
self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
if self.is_decoder:
self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
self.layer.append(T5LayerFF(config))
else:
self.layer.append(T5LayerFF(config))
def forward(self, hidden_states, attention_mask=None, position_bias=None,
encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
head_mask=None):
self_attention_outputs = self.layer[0](hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask)
hidden_states = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights
if not self.is_decoder:
hidden_states = self.layer[1](hidden_states)
else:
cross_attention_outputs = self.layer[1](hidden_states,
kv=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
head_mask=head_mask)
hidden_states = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights
hidden_states = self.layer[2](hidden_states)
outputs = (hidden_states,) + outputs # add attentions if we output them
return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
class T5PreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = T5Config
pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = load_tf_weights_in_t5
base_model_prefix = "transformer"
@property
def dummy_inputs(self):
input_ids = torch.tensor(DUMMY_INPUTS)
input_mask = torch.tensor(DUMMY_MASK)
dummy_inputs = {'decoder_input_ids': input_ids,
'encoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
return dummy_inputs
def _init_weights(self, module):
""" Initialize the weights """
factor = self.config.initializer_factor # Used for testing weights initialization
if isinstance(module, T5LayerNorm):
module.weight.data.fill_(factor*1.0)
elif isinstance(module, (T5Model, T5WithLMHeadModel)):
# Mesh TensorFlow embeddings initialization
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
elif isinstance(module, T5DenseReluDense):
# Mesh TensorFlow FF initialization
# See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
# and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
if hasattr(module.wi, 'bias') and module.wi.bias is not None:
module.wi.bias.data.zero_()
module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
if hasattr(module.wo, 'bias') and module.wo.bias is not None:
module.wo.bias.data.zero_()
elif isinstance(module, T5Attention):
# Mesh TensorFlow attention initialization to avoid scaling before softmax
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
d_model = self.config.d_model
d_kv = self.config.d_kv
n_heads = self.config.num_heads
module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
if module.has_relative_attention_bias:
module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
class T5Stack(T5PreTrainedModel):
def __init__(self, config):
super(T5Stack, self).__init__(config)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.is_decoder = config.is_decoder
self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
for i in range(config.num_layers)])
self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
self.dropout = nn.Dropout(config.dropout_rate)
self.init_weights()
def forward(self,
hidden_states,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None):
batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
if attention_mask is None:
attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
if self.is_decoder and encoder_attention_mask is None:
encoder_seq_length = encoder_hidden_states.shape[1]
encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
seq_ids = torch.arange(seq_length, device=hidden_states.device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
causal_mask = causal_mask.to(attention_mask)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -1e9 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
if self.is_decoder:
# If a 2D or 3D attention mask is provided for the cross-attention,
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
if encoder_attention_mask.dim() == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicates we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_layers
all_hidden_states = ()
all_attentions = ()
position_bias = None
encoder_decoder_position_bias = None
hidden_states = self.dropout(hidden_states)
for i, layer_module in enumerate(self.block):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states,
attention_mask=extended_attention_mask,
position_bias=position_bias,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
encoder_decoder_position_bias=encoder_decoder_position_bias,
head_mask=head_mask[i])
# layer_outputs is a tuple with:
# hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
hidden_states = layer_outputs[0]
if i == 0:
# We share the position biases between the layers - the first layer stores them
# layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
position_bias = layer_outputs[2 if self.output_attentions else 1]
if self.is_decoder:
encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1],) # We keep only self-attention weights for now
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.dropout(hidden_states)
# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
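# Illustrative sketch (not part of this commit) of the additive masking trick used in
# T5Stack.forward above: a 0/1 padding mask is broadcast to (bs, 1, 1, seq_len) and
# converted into 0 / -1e9 biases that are added to the raw attention scores, so the
# softmax assigns (numerically) zero weight to padded keys.
def _example_extended_attention_mask():
    attention_mask = torch.tensor([[1., 1., 1., 0.]])              # last position is padding
    extended = (1.0 - attention_mask[:, None, None, :]) * -1e9     # (bs, 1, 1, seq_len)
    scores = torch.zeros(1, 2, 4, 4)                               # (bs, n_heads, qlen, klen)
    weights = F.softmax(scores + extended, dim=-1)
    assert torch.allclose(weights[..., -1], torch.zeros(1, 2, 4))  # padded key gets ~0 weight
    return weights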
T5_START_DOCSTRING = r""" The T5 model was proposed in
`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
It is an encoder-decoder transformer pre-trained in a text-to-text denoising generative setting.
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matters related to general usage and behavior.
.. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
https://arxiv.org/abs/1910.10683
.. _`torch.nn.Module`:
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
T5_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
T5 input sequences are plain sequences of sub-word indices produced by the
SentencePiece-based tokenizer; unlike BERT, no [CLS] or [SEP] special tokens are added.
T5 is a model with relative position embeddings so you should be able to pad the inputs on
the right or the left.
Indices can be obtained using :class:`transformers.T5Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
"without any specific head on top.",
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5Model(T5PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids)
last_hidden_states = outputs[0] # The last decoder hidden-states are the first element of the output tuple
"""
def __init__(self, config):
super(T5Model, self).__init__(config)
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
self.encoder = T5Stack(encoder_config)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
self.decoder = T5Stack(decoder_config)
self.init_weights()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)
def forward(self, **kwargs):
# keyword arguments come in 3 flavors: encoder-specific (prefixed by
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
# that apply to the model as a whole.
# We let the specific kwargs override the common ones in case of conflict.
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
if encoder_hidden_states is None:
# Convert encoder inputs in embeddings if needed
hidden_states = kwargs_encoder.pop("inputs_embeds", None)
if hidden_states is None:
encoder_inputs_ids = kwargs_encoder.pop("input_ids")
hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings
if encoder_attention_mask is not None:
# Apply masking
encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)
encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
encoder_hidden_states = encoder_outputs[0]
else:
encoder_outputs = ()
# Decode
# Convert decoder inputs in embeddings if needed
hidden_states = kwargs_decoder.pop("inputs_embeds", None)
if hidden_states is None:
decoder_inputs_ids = kwargs_decoder.pop("input_ids")
hidden_states = self.shared(decoder_inputs_ids)
kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
return decoder_outputs + encoder_outputs
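# Usage sketch (illustrative, not part of this commit): keyword arguments are routed by
# prefix -- un-prefixed kwargs go to both stacks, while `encoder_*` / `decoder_*` kwargs
# go to one stack only and override the shared ones. Assumes the 't5-small' shortcut
# weights are available; the token ids below are arbitrary.
def _example_t5_model_forward():
    model = T5Model.from_pretrained('t5-small')
    enc_ids = torch.tensor([[37, 423, 215, 1]])          # batch size 1, 4 tokens
    dec_ids = torch.tensor([[37, 423, 1]])
    outputs = model(encoder_input_ids=enc_ids,
                    decoder_input_ids=dec_ids,
                    encoder_attention_mask=torch.ones_like(enc_ids))
    decoder_hidden, encoder_hidden = outputs[0], outputs[-1]
    return decoder_hidden.shape, encoder_hidden.shape    # (1, 3, d_model), (1, 4, d_model)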
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class T5WithLMHeadModel(T5PreTrainedModel):
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5WithLMHeadModel.from_pretrained('t5-small')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, decoder_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(T5WithLMHeadModel, self).__init__(config)
self.model_dim = config.d_model
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
self.encoder = T5Stack(encoder_config)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
self.decoder = T5Stack(decoder_config)
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
self.init_weights()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
def get_output_embeddings(self):
return self.lm_head
def forward(self, **kwargs):
# keyword arguments come in 3 flavors: encoder-specific (prefixed by
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
# that apply to the model as a whole.
# We let the specific kwargs override the common ones in case of conflict.
lm_labels = kwargs.pop('decoder_lm_labels', None)
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
if encoder_hidden_states is None:
# Convert encoder inputs in embeddings if needed
hidden_states = kwargs_encoder.pop("inputs_embeds", None)
if hidden_states is None:
encoder_inputs_ids = kwargs_encoder.pop("input_ids")
hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings
encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
encoder_hidden_states = encoder_outputs[0]
else:
encoder_outputs = ()
# Decode
# Convert decoder inputs in embeddings if needed
hidden_states = kwargs_decoder.pop("inputs_embeds", None)
if hidden_states is None:
decoder_inputs_ids = kwargs_decoder.pop("input_ids")
hidden_states = self.shared(decoder_inputs_ids)
kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
sequence_output = decoder_outputs[0]
# Rescale output before projecting on vocab
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
sequence_output = sequence_output * (self.model_dim ** -0.5)
lm_logits = self.lm_head(sequence_output)
decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here
if lm_labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
return decoder_outputs + encoder_outputs
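# Training sketch (illustrative, not part of this commit): passing `decoder_lm_labels`
# triggers the shifted cross-entropy loss above, so the returned tuple starts with
# (loss, lm_logits, ...). Assumes the 't5-small' shortcut weights are available.
def _example_t5_lm_loss():
    model = T5WithLMHeadModel.from_pretrained('t5-small')
    ids = torch.tensor([[37, 423, 215, 1]])              # arbitrary token ids, batch size 1
    outputs = model(encoder_input_ids=ids,
                    decoder_input_ids=ids,
                    decoder_lm_labels=ids)
    loss, lm_logits = outputs[0], outputs[1]
    loss.backward()                                      # an optimizer step would follow here
    return loss.item(), lm_logits.shape                  # scalar, (1, 4, vocab_size)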
...@@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC
from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
from .file_utils import add_start_docstrings
...@@ -45,6 +46,7 @@ class TFAutoModel(object):
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: TFT5Model (T5 model)
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
- contains `roberta`: TFRobertaModel (RoBERTa model)
- contains `bert`: TFBertModel (Bert model)
...@@ -68,6 +70,7 @@ class TFAutoModel(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: TFT5Model (T5 model)
- contains `distilbert`: TFDistilBertModel (DistilBERT model)
- contains `roberta`: TFRobertaModel (RoBERTa model)
- contains `bert`: TFBertModel (Bert model)
...@@ -137,7 +140,9 @@ class TFAutoModel(object):
model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
...@@ -173,6 +178,7 @@ class TFAutoModelWithLMHead(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: TFT5WithLMHeadModel (T5 model)
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
- contains `bert`: TFBertForMaskedLM (Bert model)
...@@ -199,6 +205,7 @@ class TFAutoModelWithLMHead(object):
The model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `t5`: TFT5WithLMHeadModel (T5 model)
- contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
- contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
- contains `bert`: TFBertForMaskedLM (Bert model)
...@@ -269,7 +276,9 @@ class TFAutoModelWithLMHead(object):
model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
if 't5' in pretrained_model_name_or_path:
return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
...
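# Usage sketch (illustrative, not part of this commit): the TF 2.0 auto classes mirror
# the PyTorch dispatch, so any name containing "t5" resolves to the TF T5 classes.
# Assumes the 't5-small' TF weights shortcut exists and that these classes are exported
# from the top-level transformers package.
def _example_tf_auto_dispatch():
    from transformers import TFAutoModelWithLMHead, TFT5WithLMHeadModel
    model = TFAutoModelWithLMHead.from_pretrained('t5-small')
    assert isinstance(model, TFT5WithLMHeadModel)
    return model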
...@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
logger.info("Loading PyTorch weights from {}".format(pt_path))
pt_state_dict = torch.load(pt_path, map_location='cpu')
logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
...@@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
start_prefix_to_remove = tf_model.base_model_prefix + '.'
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
tf_loaded_numel = 0
weight_value_tuples = []
all_pytorch_weights = set(list(pt_state_dict.keys()))
for symbolic_weight in symbolic_weights:
...@@ -159,7 +160,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
e.args += (symbolic_weight.shape, array.shape)
raise e
tf_loaded_numel += array.size
# logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
weight_value_tuples.append((symbolic_weight, array))
all_pytorch_weights.discard(name)
...@@ -169,6 +171,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
if tf_inputs is not None:
tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run
logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
return tf_model
...@@ -272,7 +276,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
e.args += (pt_weight.shape, array.shape)
raise e
# logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))
new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
...
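# Usage sketch (illustrative, not part of this commit): converting a PyTorch checkpoint
# into a freshly built TF 2.0 T5 model with the helper patched above; the new log lines
# report how many parameters were actually loaded. The checkpoint path is hypothetical,
# and the imports assume T5Config / TFT5Model are exported by the package.
def _example_pt_checkpoint_to_tf2(pytorch_checkpoint_path='/path/to/pytorch_model.bin'):
    from transformers import T5Config, TFT5Model
    from transformers.modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
    config = T5Config.from_pretrained('t5-small')
    tf_model = TFT5Model(config)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)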
# coding=utf-8
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 T5 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import copy
import itertools
import tensorflow as tf
from .configuration_t5 import T5Config
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
logger = logging.getLogger(__name__)
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
}
####################################################
# TF 2.0 Models are constructed using Keras imperative API by sub-classing
# - tf.keras.layers.Layer for the layers and
# - TFPreTrainedModel for the models (itself a subclass of tf.keras.Model)
####################################################
class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
""" Construct a layernorm module in the T5 style
No bias and no substraction of mean.
"""
super(TFT5LayerNorm, self).__init__(**kwargs)
self.variance_epsilon = epsilon
def build(self, input_shape):
"""Build shared word embedding layer """
self.weight = self.add_weight(
"weight",
shape=(input_shape[-1],),
initializer='ones')
super(TFT5LayerNorm, self).build(input_shape)
def call(self, x):
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
x = x * tf.math.rsqrt(variance + self.variance_epsilon)
return self.weight * x
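# Quick check (illustrative, not part of this commit): like its PyTorch counterpart,
# TFT5LayerNorm is an RMS norm with a learned gain and no mean subtraction.
def _example_tf_t5_layer_norm():
    x = tf.random.normal((2, 4, 8))
    ln = TFT5LayerNorm(epsilon=1e-6)
    y = ln(x)                                    # first call builds the weight (initialized to ones)
    manual = x * tf.math.rsqrt(tf.reduce_mean(tf.square(x), axis=-1, keepdims=True) + 1e-6)
    tf.debugging.assert_near(y, manual)
    return y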
class TFT5DenseReluDense(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFT5DenseReluDense, self).__init__(**kwargs)
self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')
self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
self.act = tf.keras.activations.relu
def call(self, hidden_states, training=False):
h = self.wi(hidden_states)
h = self.act(h)
h = self.dropout(h, training=training)
h = self.wo(h)
return h
class TFT5LayerFF(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFT5LayerFF, self).__init__(**kwargs)
self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
name='layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, training=False):
norm_x = self.layer_norm(hidden_states)
y = self.DenseReluDense(norm_x, training=training)
layer_output = hidden_states + self.dropout(y, training=training)
return layer_output
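# Note: T5 uses a pre-layer-norm residual pattern throughout: x -> layer_norm(x) -> sub-layer -> dropout -> add to x.
# The same structure is reused by the self-attention and cross-attention layers below.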
class TFT5Attention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super(TFT5Attention, self).__init__(**kwargs)
self.layer_id = next(TFT5Attention.NEW_ID)
self.is_decoder = config.is_decoder
self.has_relative_attention_bias = has_relative_attention_bias
self.output_attentions = config.output_attentions
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.d_model = config.d_model
self.d_kv = config.d_kv
self.n_heads = config.num_heads
self.inner_dim = self.n_heads * self.d_kv
# Mesh TensorFlow initialization to avoid scaling before softmax
self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
if self.has_relative_attention_bias:
self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
self.n_heads,
name='relative_attention_bias')
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def _relative_position_bucket(relative_position,
bidirectional=True,
num_buckets=32,
max_distance=128):
"""
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention.
The relative position is defined as memory_position - query_position, i.e.
the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are
invalid.
We use smaller buckets for small absolute relative_position and larger buckets
for larger absolute relative_positions. All relative positions >=max_distance
map to the same bucket. All relative positions <=-max_distance map to the
same bucket. This should allow for more graceful generalization to longer
sequences than the model has been trained on.
Args:
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32
values in the range [0, num_buckets)
"""
ret = 0
n = -relative_position
if bidirectional:
num_buckets //= 2
ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
n = tf.math.abs(n)
else:
n = tf.math.maximum(n, 0)
# now n is in the range [0, inf)
max_exact = num_buckets // 2
is_small = tf.math.less(n, max_exact)
val_if_large = max_exact + tf.dtypes.cast(
tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
/ math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
ret += tf.where(is_small, n, val_if_large)
return ret
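# Illustrative behaviour, derived from the code above: with the defaults (num_buckets=32,
# bidirectional=True) each direction gets 16 buckets; relative distances 0..7 map to their own
# bucket, distances 8 and above are binned logarithmically, and everything at or beyond
# max_distance falls into the last bucket of that direction.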
def compute_bias(self, qlen, klen):
""" Compute binned relative position bias """
context_position = tf.range(qlen)[:, None]
memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket(relative_position,
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets)
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
return values
def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
# Input is (bs, qlen, dim)
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = shape_list(input)
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
else:
klen = shape_list(kv)[1]
def shape(x):
""" projection """
return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3))
def unshape(x):
""" compute context """
return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head)
if kv is None:
k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head)
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k(k)) # (bs, n_heads, klen, dim_per_head)
v = shape(self.v(v)) # (bs, n_heads, klen, dim_per_head)
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head)
v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
# q = q / math.sqrt(dim_per_head) # No scaling in T5
# scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen)
scores = tf.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen)
if position_bias is None:
if not self.has_relative_attention_bias:
raise ValueError("No position_bias provided and no weights to compute position_bias")
position_bias = self.compute_bias(qlen, klen)
if mask is not None:
position_bias = position_bias + mask
# mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen)
# scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen)
scores += position_bias
weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen)
weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen)
# Mask heads if we want to
if head_mask is not None:
weights = weights * head_mask
context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
context = unshape(context) # (bs, qlen, dim)
context = self.o(context)
outputs = (context,)
if self.output_attentions:
outputs = outputs + (weights,)
if self.has_relative_attention_bias:
outputs = outputs + (position_bias,)
return outputs
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super(TFT5LayerSelfAttention, self).__init__(**kwargs)
self.SelfAttention = TFT5Attention(config,
has_relative_attention_bias=has_relative_attention_bias,
name='SelfAttention')
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
name='layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, attention_mask=None, position_bias=None,
head_mask=None, training=False):
norm_x = self.layer_norm(hidden_states)
attention_output = self.SelfAttention(norm_x,
mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask,
training=training)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y, training=training)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super(TFT5LayerCrossAttention, self).__init__(**kwargs)
self.EncDecAttention = TFT5Attention(config,
has_relative_attention_bias=has_relative_attention_bias,
name='EncDecAttention')
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
name='layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
head_mask=None, training=False):
norm_x = self.layer_norm(hidden_states)
attention_output = self.EncDecAttention(norm_x,
mask=attention_mask,
kv=kv,
position_bias=position_bias,
head_mask=head_mask,
training=training)
y = attention_output[0]
layer_output = hidden_states + self.dropout(y, training=training)
outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
return outputs
class TFT5Block(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super(TFT5Block, self).__init__(**kwargs)
self.is_decoder = config.is_decoder
self.layer = []
self.layer.append(TFT5LayerSelfAttention(config,
has_relative_attention_bias=has_relative_attention_bias,
name='layer_._0'))
if self.is_decoder:
self.layer.append(TFT5LayerCrossAttention(config,
has_relative_attention_bias=has_relative_attention_bias,
name='layer_._1'))
self.layer.append(TFT5LayerFF(config, name='layer_._2'))
else:
self.layer.append(TFT5LayerFF(config, name='layer_._1'))
def call(self, hidden_states, attention_mask=None, position_bias=None,
encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
head_mask=None, training=False):
self_attention_outputs = self.layer[0](hidden_states,
attention_mask=attention_mask,
position_bias=position_bias,
head_mask=head_mask,
training=training)
hidden_states = self_attention_outputs[0]
outputs = self_attention_outputs[1:]
if not self.is_decoder:
hidden_states = self.layer[1](hidden_states, training=training)
else:
cross_attention_outputs = self.layer[1](hidden_states,
kv=encoder_hidden_states,
attention_mask=encoder_attention_mask,
position_bias=encoder_decoder_position_bias,
head_mask=head_mask,
training=training)
hidden_states = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:]
hidden_states = self.layer[2](hidden_states, training=training)
outputs = (hidden_states,) + outputs # add attentions if we output them
return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
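# For example, with output_attentions=True the first block returns
# (hidden_states, self_attn_weights, self_attn_position_bias) in the encoder, plus
# (cross_attn_weights, cross_attn_position_bias) in the decoder; later blocks omit the
# position-bias entries because only block 0 has relative attention bias weights.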
####################################################
# The full model without a specific pre-training or fine-tuning head is
# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
####################################################
class TFT5MainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFT5MainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.is_decoder = config.is_decoder
self.config = config
self.num_hidden_layers = config.num_layers
self.block = [TFT5Block(config,
has_relative_attention_bias=bool(i == 0),
name='block_._{}'.format(i))
for i in range(config.num_layers)]
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
name='final_layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
def _prune_heads(self, heads_to_prune):
raise NotImplementedError # Not implemented yet in the library for TF 2.0 models
def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
encoder_attention_mask=None, head_mask=None, training=False):
batch_size, seq_length = shape_list(hidden_states)[:2]
if attention_mask is None:
attention_mask = tf.fill((batch_size, seq_length), 1)
if self.is_decoder and encoder_attention_mask is None:
encoder_seq_length = encoder_hidden_states.shape[1]
encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
num_dims_attention_mask = len(shape_list(attention_mask))
if num_dims_attention_mask == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif num_dims_attention_mask == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
seq_ids = tf.range(seq_length)
causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
seq_ids[None, :, None])
causal_mask = tf.cast(causal_mask, dtype=tf.float32)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
# T5 has a mask that can compare sequence ids; we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# extended_attention_mask = tf.math.equal(extended_attention_mask,
# tf.transpose(extended_attention_mask, perm=(-1, -2)))
extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
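# Illustrative example (assuming a plain padding mask): attention_mask [1, 1, 0] becomes the
# additive mask [0.0, 0.0, -1e9], so the softmax assigns (numerically) zero probability to the
# padded position.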
if self.is_decoder:
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
if num_dims_encoder_attention_mask == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if num_dims_encoder_attention_mask == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
# T5 has a mask that can compare sequence ids; we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
# tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
all_hidden_states = ()
all_attentions = ()
position_bias = None
encoder_decoder_position_bias = None
for i, layer_module in enumerate(self.block):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(hidden_states,
attention_mask=extended_attention_mask,
position_bias=position_bias,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
encoder_decoder_position_bias=encoder_decoder_position_bias,
head_mask=head_mask[i],
training=training)
hidden_states = layer_outputs[0]
if i == 0:
# We share the position biases between the layers - the first layer stores them
# layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
position_bias = layer_outputs[2 if self.output_attentions else 1]
if self.is_decoder:
encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
####################################################
# TFT5PreTrainedModel is a sub-class of tf.keras.Model
# which takes care of loading and saving pretrained weights
# and various common utilities.
# Here you just need to specify a few (self-explanatory)
# pointers for your model.
####################################################
class TFT5PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = T5Config
pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "transformer"
@property
def dummy_inputs(self):
input_ids = tf.constant(DUMMY_INPUTS)
input_mask = tf.constant(DUMMY_MASK)
dummy_inputs = {'decoder_input_ids': input_ids,
'encoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
return dummy_inputs
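# These dummy inputs are used to build the Keras variables with a first forward pass before
# pretrained weights are loaded (hence both encoder and decoder inputs are provided).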
T5_START_DOCSTRING = r""" The T5 model was proposed in
`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
It is an encoder-decoder transformer pre-trained in a text-to-text denoising generative setting.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
https://arxiv.org/abs/1910.10683
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
T5_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
T5 does not use BERT-style ``[CLS]``/``[SEP]`` formatting: inputs are plain sequences of token indices produced by the SentencePiece-based :class:`transformers.T5Tokenizer`.
T5 is a model with relative position embeddings so you should be able to pad the inputs on
the right or the left.
Indices can be obtained using :class:`transformers.T5Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states "
"without any specific head on top.",
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class TFT5Model(TFT5PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids, encoder_input_ids=input_ids)  # T5 is an encoder-decoder model: provide both encoder and decoder inputs
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFT5Model, self).__init__(config, *inputs, **kwargs)
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
name='shared')
encoder_config = copy.deepcopy(config)
self.encoder = TFT5MainLayer(encoder_config, name='encoder')
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
self.decoder = TFT5MainLayer(decoder_config, name='decoder')
def get_input_embeddings(self):
return self.shared
def get_output_embeddings(self):
return self.shared
def call(self, decoder_input_ids, **kwargs):
# We allow two types of multi-inputs:
# - traditional keyword arguments in the call method
# - all the arguments provided as a dict in the first positional argument of call
# The last option is useful when using the tf.keras fit() method.
if isinstance(decoder_input_ids, dict):
kwargs.update(decoder_input_ids)
else:
kwargs['decoder_input_ids'] = decoder_input_ids
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
if encoder_hidden_states is None:
# Convert encoder inputs into embeddings if needed
hidden_states = kwargs_encoder.pop("inputs_embeds", None)
if hidden_states is None:
encoder_inputs_ids = kwargs_encoder.pop("input_ids")
hidden_states = self.shared(encoder_inputs_ids) # Convert input ids into embeddings
encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
encoder_hidden_states = encoder_outputs[0]
else:
encoder_outputs = ()
# Decode
# Convert decoder inputs into embeddings if needed
hidden_states = kwargs_decoder.pop("inputs_embeds", None)
if hidden_states is None:
decoder_inputs_ids = kwargs_decoder.pop("input_ids")
hidden_states = self.shared(decoder_inputs_ids)
kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
return decoder_outputs + encoder_outputs
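# The returned tuple is the decoder outputs followed by the encoder outputs: when the encoder is
# run and no optional outputs are requested, outputs[0] is the decoder's last hidden state and
# outputs[-1] is the encoder's last hidden state.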
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
class TFT5WithLMHeadModel(TFT5PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import T5Tokenizer, TFT5WithLMHeadModel
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5WithLMHeadModel.from_pretrained('t5-small')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids, encoder_input_ids=input_ids)  # provide both encoder and decoder inputs
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.model_dim = config.d_model
self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
name='shared')
encoder_config = copy.deepcopy(config)
self.encoder = TFT5MainLayer(encoder_config, name='encoder')
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
self.decoder = TFT5MainLayer(decoder_config, name='decoder')
def get_input_embeddings(self):
return self.shared
def get_output_embeddings(self):
return self.shared
def call(self, decoder_input_ids, **kwargs):
# We allow two types of multi-inputs:
# - traditional keyword arguments in the call method
# - all the arguments provided as a dict in the first positional argument of call
# The last option is useful when using the tf.keras fit() method.
if isinstance(decoder_input_ids, dict):
kwargs.update(decoder_input_ids)
else:
kwargs['decoder_input_ids'] = decoder_input_ids
kwargs_common = dict((k, v) for k, v in kwargs.items()
if not k.startswith("encoder_") and not k.startswith("decoder_"))
kwargs_encoder = kwargs_common.copy()
kwargs_decoder = kwargs_common.copy()
kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
# Encode if needed (training, first prediction pass)
encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
if encoder_hidden_states is None:
# Convert encoder inputs into embeddings if needed
hidden_states = kwargs_encoder.pop("inputs_embeds", None)
if hidden_states is None:
encoder_inputs_ids = kwargs_encoder.pop("input_ids")
hidden_states = self.shared(encoder_inputs_ids) # Convert input ids into embeddings
encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
encoder_hidden_states = encoder_outputs[0]
else:
encoder_outputs = ()
# Decode
# Convert decoder inputs into embeddings if needed
hidden_states = kwargs_decoder.pop("inputs_embeds", None)
if hidden_states is None:
decoder_inputs_ids = kwargs_decoder.pop("input_ids")
hidden_states = self.shared(decoder_inputs_ids)
kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
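# Rescaling by d_model**-0.5 before reusing the (tied) shared embedding as the output projection
# mirrors the original Mesh TensorFlow implementation (assumption: it compensates for the scale
# of the tied embedding weights).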
lm_logits = self.shared(sequence_output, mode="linear")
decoder_outputs = (lm_logits,) + decoder_outputs[1:]
return decoder_outputs + encoder_outputs
...@@ -24,14 +24,12 @@ import os ...@@ -24,14 +24,12 @@ import os
import tensorflow as tf import tensorflow as tf
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
cached_path, hf_bucket_url, is_remote_url) cached_path, hf_bucket_url, is_remote_url)
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
class TFPreTrainedModel(tf.keras.Model): class TFPreTrainedModel(tf.keras.Model):
r""" Base class for all TF models. r""" Base class for all TF models.
...@@ -60,7 +58,7 @@ class TFPreTrainedModel(tf.keras.Model): ...@@ -60,7 +58,7 @@ class TFPreTrainedModel(tf.keras.Model):
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
""" """
return tf.constant(DUMMY_INPUTS) return {'input_ids': tf.constant(DUMMY_INPUTS)}
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
......
...@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): ...@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else: else:
langs_list = None langs_list = None
return [inputs_list, attns_list, langs_list] return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
XLM_START_DOCSTRING = r""" The XLM model was proposed in XLM_START_DOCSTRING = r""" The XLM model was proposed in
......
...@@ -31,12 +31,11 @@ from torch.nn import CrossEntropyLoss ...@@ -31,12 +31,11 @@ from torch.nn import CrossEntropyLoss
from torch.nn import functional as F from torch.nn import functional as F
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
cached_path, hf_bucket_url, is_remote_url) cached_path, hf_bucket_url, is_remote_url)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
try: try:
from torch.nn import Identity from torch.nn import Identity
except ImportError: except ImportError:
...@@ -72,6 +71,15 @@ class PreTrainedModel(nn.Module): ...@@ -72,6 +71,15 @@ class PreTrainedModel(nn.Module):
load_tf_weights = lambda model, config, path: None load_tf_weights = lambda model, config, path: None
base_model_prefix = "" base_model_prefix = ""
@property
def dummy_inputs(self):
""" Dummy inputs to do a forward pass in the network.
Returns:
Dict of torch.Tensor with dummy inputs
"""
return {'input_ids': torch.tensor(DUMMY_INPUTS)}
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super(PreTrainedModel, self).__init__() super(PreTrainedModel, self).__init__()
if not isinstance(config, PretrainedConfig): if not isinstance(config, PretrainedConfig):
...@@ -161,8 +169,7 @@ class PreTrainedModel(nn.Module): ...@@ -161,8 +169,7 @@ class PreTrainedModel(nn.Module):
base_model.vocab_size = new_num_tokens base_model.vocab_size = new_num_tokens
# Tie weights again if needed # Tie weights again if needed
if hasattr(self, 'tie_weights'): self.tie_weights()
self.tie_weights()
return model_embeds return model_embeds
...@@ -478,8 +485,7 @@ class PreTrainedModel(nn.Module): ...@@ -478,8 +485,7 @@ class PreTrainedModel(nn.Module):
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs))) model.__class__.__name__, "\n\t".join(error_msgs)))
if hasattr(model, 'tie_weights'): model.tie_weights() # make sure word embedding weights are still tied if needed
model.tie_weights() # make sure word embedding weights are still tied
# Set model in evaluation mode to deactivate DropOut modules by default # Set model in evaluation mode to deactivate DropOut modules by default
model.eval() model.eval()
......
...@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel): ...@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
def __init__(self, *inputs, **kwargs): def __init__(self, *inputs, **kwargs):
super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
@property
def dummy_inputs(self):
inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if self.config.use_lang_emb and self.config.n_langs > 1:
langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
def _init_weights(self, module): def _init_weights(self, module):
""" Initialize the weights. """ """ Initialize the weights. """
if isinstance(module, nn.Embedding): if isinstance(module, nn.Embedding):
...@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
langs=langs, langs=langs,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds) inputs_embeds=inputs_embeds)
......
...@@ -15,18 +15,30 @@ ...@@ -15,18 +15,30 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import os import os
import six
import time import time
import unittest import unittest
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError import requests
import six
from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj
USER = "__DUMMY_TRANSFORMERS_USER__" USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__" PASS = "__DUMMY_TRANSFORMERS_PASS__"
FILE_KEY = "Test-{}.txt".format(int(time.time())) FILES = [
FILE_PATH = os.path.join( (
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt" "Test-{}.txt".format(int(time.time())),
) os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
)
),
(
"yoyo {}.txt".format(int(time.time())), # space is intentional
os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
)
),
]
...@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest): ...@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
self.assertEqual(user, USER) self.assertEqual(user, USER)
def test_presign(self): def test_presign(self):
urls = self._api.presign(token=self._token, filename=FILE_KEY) for FILE_KEY, FILE_PATH in FILES:
self.assertIsInstance(urls, PresignedUrl) urls = self._api.presign(token=self._token, filename=FILE_KEY)
self.assertEqual(urls.type, "text/plain") self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self): def test_presign_and_upload(self):
access_url = self._api.presign_and_upload( for FILE_KEY, FILE_PATH in FILES:
token=self._token, filename=FILE_KEY, filepath=FILE_PATH access_url = self._api.presign_and_upload(
) token=self._token, filename=FILE_KEY, filepath=FILE_PATH
self.assertIsInstance(access_url, six.string_types) )
self.assertIsInstance(access_url, six.string_types)
with open(FILE_PATH, 'r') as f:
body = f.read()
r = requests.get(access_url)
self.assertEqual(r.text, body)
def test_list_objs(self): def test_list_objs(self):
objs = self._api.list_objs(token=self._token) objs = self._api.list_objs(token=self._token)
......
...@@ -58,7 +58,7 @@ else: ...@@ -58,7 +58,7 @@ else:
def _config_zero_init(config): def _config_zero_init(config):
configs_no_init = copy.deepcopy(config) configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys(): for key in configs_no_init.__dict__.keys():
if '_range' in key or '_std' in key: if '_range' in key or '_std' in key or 'initializer_factor' in key:
setattr(configs_no_init, key, 0.0) setattr(configs_no_init, key, 0.0)
return configs_no_init return configs_no_init
...@@ -73,6 +73,7 @@ class CommonTestCases: ...@@ -73,6 +73,7 @@ class CommonTestCases:
test_pruning = True test_pruning = True
test_resize_embeddings = True test_resize_embeddings = True
test_head_masking = True test_head_masking = True
is_encoder_decoder = False
def test_save_load(self): def test_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -83,6 +84,8 @@ class CommonTestCases: ...@@ -83,6 +84,8 @@ class CommonTestCases:
model.eval() model.eval()
with torch.no_grad(): with torch.no_grad():
outputs = model(**inputs_dict) outputs = model(**inputs_dict)
out_2 = outputs[0].numpy()
out_2[np.isnan(out_2)] = 0
with TemporaryDirectory() as tmpdirname: with TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname) model.save_pretrained(tmpdirname)
...@@ -93,9 +96,7 @@ class CommonTestCases: ...@@ -93,9 +96,7 @@ class CommonTestCases:
# Make sure we don't have nans # Make sure we don't have nans
out_1 = after_outputs[0].cpu().numpy() out_1 = after_outputs[0].cpu().numpy()
out_2 = outputs[0].cpu().numpy() out_1[np.isnan(out_1)] = 0
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2)) max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5) self.assertLessEqual(max_diff, 1e-5)
...@@ -117,20 +118,32 @@ class CommonTestCases: ...@@ -117,20 +118,32 @@ class CommonTestCases:
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0] with torch.no_grad():
self.assertEqual(first.ne(second).sum().item(), 0) first = model(**inputs_dict)[0]
second = model(**inputs_dict)[0]
out_1 = first.cpu().numpy()
out_2 = second.cpu().numpy()
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5)
def test_attention_outputs(self): def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
config.output_attentions = True config.output_attentions = True
config.output_hidden_states = False config.output_hidden_states = False
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_attentions, True)
self.assertEqual(model.config.output_hidden_states, False) self.assertEqual(model.config.output_hidden_states, False)
...@@ -138,28 +151,42 @@ class CommonTestCases: ...@@ -138,28 +151,42 @@ class CommonTestCases:
self.assertListEqual( self.assertListEqual(
list(attentions[0].shape[-3:]), list(attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, [self.model_tester.num_attention_heads,
self.model_tester.seq_length, encoder_seq_length,
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) encoder_key_length])
out_len = len(outputs) out_len = len(outputs)
if self.is_encoder_decoder:
self.assertEqual(out_len % 2, 0)
decoder_attentions = outputs[(out_len // 2)-1]
self.assertEqual(model.config.output_attentions, True)
self.assertEqual(model.config.output_hidden_states, False)
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(decoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads,
decoder_seq_length,
decoder_key_length
])
# Check attention is always last and order is fine # Check attention is always last and order is fine
config.output_attentions = True config.output_attentions = True
config.output_hidden_states = True config.output_hidden_states = True
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(**inputs_dict) with torch.no_grad():
self.assertEqual(out_len+1, len(outputs)) outputs = model(**inputs_dict)
self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_attentions, True)
self.assertEqual(model.config.output_hidden_states, True) self.assertEqual(model.config.output_hidden_states, True)
attentions = outputs[-1] self_attentions = outputs[-1]
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual( self.assertListEqual(
list(attentions[0].shape[-3:]), list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, [self.model_tester.num_attention_heads,
self.model_tester.seq_length, encoder_seq_length,
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) encoder_key_length])
def test_torchscript(self): def test_torchscript(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -223,7 +250,6 @@ class CommonTestCases: ...@@ -223,7 +250,6 @@ class CommonTestCases:
self.assertTrue(models_equal) self.assertTrue(models_equal)
def test_headmasking(self): def test_headmasking(self):
if not self.test_head_masking: if not self.test_head_masking:
return return
...@@ -278,7 +304,6 @@ class CommonTestCases: ...@@ -278,7 +304,6 @@ class CommonTestCases:
self.assertNotEqual( self.assertNotEqual(
attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
def test_head_pruning(self): def test_head_pruning(self):
if not self.test_pruning: if not self.test_pruning:
return return
...@@ -297,7 +322,8 @@ class CommonTestCases: ...@@ -297,7 +322,8 @@ class CommonTestCases:
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-1: [0]} -1: [0]}
model.prune_heads(heads_to_prune) model.prune_heads(heads_to_prune)
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
...@@ -333,7 +359,8 @@ class CommonTestCases: ...@@ -333,7 +359,8 @@ class CommonTestCases:
model = model_class.from_pretrained(directory) model = model_class.from_pretrained(directory)
model.to(torch_device) model.to(torch_device)
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[0].shape[-3], 1)
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
...@@ -362,7 +389,8 @@ class CommonTestCases: ...@@ -362,7 +389,8 @@ class CommonTestCases:
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[0].shape[-3], 1)
...@@ -389,7 +417,8 @@ class CommonTestCases: ...@@ -389,7 +417,8 @@ class CommonTestCases:
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
...@@ -406,7 +435,8 @@ class CommonTestCases: ...@@ -406,7 +435,8 @@ class CommonTestCases:
model.to(torch_device) model.to(torch_device)
shutil.rmtree(directory) shutil.rmtree(directory)
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
...@@ -417,7 +447,8 @@ class CommonTestCases: ...@@ -417,7 +447,8 @@ class CommonTestCases:
heads_to_prune = {0: [0], 2: [1, 2]} heads_to_prune = {0: [0], 2: [1, 2]}
model.prune_heads(heads_to_prune) model.prune_heads(heads_to_prune)
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
attentions = outputs[-1] attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1) self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
...@@ -427,7 +458,6 @@ class CommonTestCases: ...@@ -427,7 +458,6 @@ class CommonTestCases:
self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
def test_hidden_states_output(self): def test_hidden_states_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -437,14 +467,16 @@ class CommonTestCases: ...@@ -437,14 +467,16 @@ class CommonTestCases:
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(**inputs_dict) with torch.no_grad():
outputs = model(**inputs_dict)
hidden_states = outputs[-1] hidden_states = outputs[-1]
self.assertEqual(model.config.output_attentions, False) self.assertEqual(model.config.output_attentions, False)
self.assertEqual(model.config.output_hidden_states, True) self.assertEqual(model.config.output_hidden_states, True)
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
self.assertListEqual( self.assertListEqual(
list(hidden_states[0].shape[-2:]), list(hidden_states[0].shape[-2:]),
[self.model_tester.seq_length, self.model_tester.hidden_size]) [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
self.model_tester.hidden_size])
def test_resize_tokens_embeddings(self): def test_resize_tokens_embeddings(self):
original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -550,8 +582,14 @@ class CommonTestCases: ...@@ -550,8 +582,14 @@ class CommonTestCases:
def test_inputs_embeds(self): def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict["input_ids"] if not self.is_encoder_decoder:
del inputs_dict["input_ids"] input_ids = inputs_dict["input_ids"]
del inputs_dict["input_ids"]
else:
encoder_input_ids = inputs_dict["encoder_input_ids"]
decoder_input_ids = inputs_dict["decoder_input_ids"]
del inputs_dict["encoder_input_ids"]
del inputs_dict["decoder_input_ids"]
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
...@@ -559,9 +597,14 @@ class CommonTestCases: ...@@ -559,9 +597,14 @@ class CommonTestCases:
model.eval() model.eval()
wte = model.get_input_embeddings() wte = model.get_input_embeddings()
inputs_dict["inputs_embeds"] = wte(input_ids) if not self.is_encoder_decoder:
outputs = model(**inputs_dict) inputs_dict["inputs_embeds"] = wte(input_ids)
else:
inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
with torch.no_grad():
outputs = model(**inputs_dict)
class GPTModelTester(CommonModelTester): class GPTModelTester(CommonModelTester):
...@@ -649,9 +692,10 @@ class CommonTestCases: ...@@ -649,9 +692,10 @@ class CommonTestCases:
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(input_ids, position_ids, token_type_ids) with torch.no_grad():
outputs = model(input_ids, position_ids) outputs = model(input_ids, position_ids, token_type_ids)
outputs = model(input_ids) outputs = model(input_ids, position_ids)
outputs = model(input_ids)
hidden_state = outputs[0] hidden_state = outputs[0]
self.parent.assertListEqual( self.parent.assertListEqual(
...@@ -664,7 +708,8 @@ class CommonTestCases: ...@@ -664,7 +708,8 @@ class CommonTestCases:
model = self.lm_head_model_class(config) model = self.lm_head_model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(input_ids, position_ids, token_type_ids, lm_labels) with torch.no_grad():
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
loss, lm_logits = outputs[:2] loss, lm_logits = outputs[:2]
total_voc = self.vocab_size total_voc = self.vocab_size
...@@ -681,7 +726,8 @@ class CommonTestCases: ...@@ -681,7 +726,8 @@ class CommonTestCases:
model = model_class(config) model = model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(input_ids) with torch.no_grad():
outputs = model(input_ids)
presents = outputs[-1] presents = outputs[-1]
self.parent.assertEqual(self.num_hidden_layers, len(presents)) self.parent.assertEqual(self.num_hidden_layers, len(presents))
self.parent.assertListEqual( self.parent.assertListEqual(
...@@ -694,7 +740,8 @@ class CommonTestCases: ...@@ -694,7 +740,8 @@ class CommonTestCases:
model = self.double_head_model_class(config) model = self.double_head_model_class(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, with torch.no_grad():
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids) token_type_ids=token_type_ids, position_ids=position_ids)
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
loss = [lm_loss, mc_loss] loss = [lm_loss, mc_loss]
......
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (T5Config, T5Model, T5WithLMHeadModel)
from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class T5ModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
is_encoder_decoder = True
class T5ModelTester(object):
def __init__(self,
parent,
batch_size=13,
encoder_seq_length=7,
decoder_seq_length=9,
is_training=True,
use_attention_mask=True,
use_labels=True,
vocab_size=99,
n_positions=14,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
d_ff=37,
relative_attention_num_buckets=8,
dropout_rate=0.1,
initializer_factor=0.002,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_positions = n_positions
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.d_ff = d_ff
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.initializer_factor = initializer_factor
self.scope = scope
def prepare_config_and_inputs(self):
encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
encoder_attention_mask = None
decoder_attention_mask = None
if self.use_attention_mask:
encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
decoder_lm_labels = None
if self.use_labels:
decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = T5Config(
vocab_size_or_config_json_file=self.vocab_size,
n_positions=self.n_positions,
d_model=self.hidden_size,
d_ff=self.d_ff,
d_kv=self.hidden_size // self.num_attention_heads,
num_layers=self.num_hidden_layers,
num_heads=self.num_attention_heads,
relative_attention_num_buckets=self.relative_attention_num_buckets,
dropout_rate=self.dropout_rate,
initializer_factor=self.initializer_factor)
return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
model = T5Model(config=config)
model.eval()
decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
decoder_input_ids=decoder_input_ids,
encoder_attention_mask=encoder_attention_mask,
decoder_attention_mask=decoder_attention_mask)
decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
decoder_input_ids=decoder_input_ids)
result = {
"encoder_output": encoder_output,
"decoder_output": decoder_output,
}
self.parent.assertListEqual(
list(result["encoder_output"].size()),
[self.batch_size, self.encoder_seq_length, self.hidden_size])
self.parent.assertListEqual(
list(result["decoder_output"].size()),
[self.batch_size, self.decoder_seq_length, self.hidden_size])
def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
model = T5WithLMHeadModel(config=config)
model.eval()
outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
loss, prediction_scores = outputs[0], outputs[1]
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.decoder_seq_length, self.vocab_size])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
decoder_attention_mask, decoder_lm_labels) = config_and_inputs
inputs_dict = {'encoder_input_ids': encoder_input_ids,
'decoder_input_ids': decoder_input_ids,
'decoder_attention_mask': decoder_attention_mask,
'encoder_attention_mask': encoder_attention_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = T5ModelTest.T5ModelTester(self)
self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_t5_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_model(*config_and_inputs)
def test_with_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
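For readers skimming the diff, the snippet below is a minimal stand-alone sketch of the forward pass that create_and_check_t5_model above exercises; it assumes an environment with this branch of transformers installed, and simply reuses the tester defaults (batch_size=13, encoder_seq_length=7, decoder_seq_length=9, d_model=32).

import torch
from transformers import T5Config, T5Model

config = T5Config(vocab_size_or_config_json_file=99, n_positions=14, d_model=32,
                  d_ff=37, d_kv=32 // 4, num_layers=5, num_heads=4,
                  relative_attention_num_buckets=8, dropout_rate=0.1,
                  initializer_factor=0.002)
model = T5Model(config=config)
model.eval()

encoder_input_ids = torch.randint(0, 99, (13, 7))  # batch_size x encoder_seq_length
decoder_input_ids = torch.randint(0, 99, (13, 9))  # batch_size x decoder_seq_length
with torch.no_grad():
    decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
                                           decoder_input_ids=decoder_input_ids)
print(encoder_output.shape)  # torch.Size([13, 7, 32])
print(decoder_output.shape)  # torch.Size([13, 9, 32])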
@@ -69,6 +69,7 @@ class TFCommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        is_encoder_decoder = False

         def test_initialization(self):
             pass
@@ -129,8 +130,12 @@ class TFCommonTestCases:
                                       for name, key in inputs_dict.items())
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tf_model(inputs_dict, training=False)
+                tf_hidden_states = tfo[0].numpy()
+                pt_hidden_states = pto[0].numpy()
+                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                 self.assertLessEqual(max_diff, 2e-2)

             # Check we can load pt model in tf and vice-versa with checkpoint => model functions
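As an aside, the tolerance check introduced here is plain NumPy; the short sketch below (with small hypothetical arrays, not real model outputs) shows why NaN positions have to be zeroed out before taking the maximum absolute difference.

import numpy as np

tf_hidden_states = np.array([[0.1, np.nan], [0.2, 0.3]], dtype=np.float32)  # hypothetical TF output
pt_hidden_states = np.array([[0.1, np.nan], [0.2, 0.3]], dtype=np.float32)  # hypothetical PT output

# A single NaN poisons the raw comparison: nan - nan is nan, and amax propagates it.
print(np.amax(np.abs(tf_hidden_states - pt_hidden_states)))  # nan

tf_hidden_states[np.isnan(tf_hidden_states)] = 0
pt_hidden_states[np.isnan(pt_hidden_states)] = 0
max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
print(max_diff)  # 0.0, which can now be checked against the 2e-2 tolerance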
@@ -150,13 +155,21 @@ class TFCommonTestCases:
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                 self.assertLessEqual(max_diff, 2e-2)

         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
             metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
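The branch added above only changes how the symbolic inputs are declared; everything downstream is ordinary Keras. As a rough illustration, with a toy shared-embedding body standing in for the real transformer (the embedding and dense layers here are illustrative, not transformers code), a functional model can be built and compiled from two named Inputs like this:

import tensorflow as tf

encoder_input_ids = tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')
decoder_input_ids = tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32')

# Toy stand-in for the model body: a shared embedding applied to both streams.
shared_embedding = tf.keras.layers.Embedding(input_dim=99, output_dim=32)
hidden = shared_embedding(encoder_input_ids) + shared_embedding(decoder_input_ids)
logits = tf.keras.layers.Dense(99)(hidden)

model = tf.keras.Model(inputs=[encoder_input_ids, decoder_input_ids], outputs=logits)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
model.summary()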
@@ -189,7 +202,7 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)

                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids')
+                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                 outputs_keywords = model(input_ids, **inputs_keywords)
                 output_dict = outputs_dict[0].numpy()
@@ -200,6 +213,11 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length

             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
@@ -212,16 +230,28 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])
                 out_len = len(outputs)

+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 outputs = model(inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
@@ -230,8 +260,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                     self.model_tester.seq_length,
-                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                     encoder_seq_length,
+                     encoder_key_length])

         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -264,35 +294,53 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
+
+        def _get_embeds(self, wte, input_ids):
+            # In our TF models, the input embeddings can take slightly different forms,
+            # so we try a few of them and, if none works, fall back to a synthetic
+            # dummy tensor of ones:
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x

         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]

             for model_class in self.all_model_classes:
                 model = model_class(config)
                 wte = model.get_input_embeddings()
-                try:
-                    x = wte(input_ids, mode="embedding")
-                except:
-                    try:
-                        x = wte([input_ids], mode="embedding")
-                    except:
-                        try:
-                            x = wte([input_ids, None, None, None], mode="embedding")
-                        except:
-                            if hasattr(self.model_tester, "embedding_size"):
-                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                            else:
-                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try a few of them.
-                # We used to fall back to just synthetically creating a dummy tensor of ones:
-                #
-                inputs_dict["inputs_embeds"] = x
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
                 outputs = model(inputs_dict)
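To make the mode="embedding" convention probed by _get_embeds concrete, here is a hypothetical toy shared-embedding layer (ToySharedEmbedding is illustrative only, not a transformers class) for which the first branch of the helper would already succeed:

import tensorflow as tf

class ToySharedEmbedding(tf.keras.layers.Layer):
    # Minimal shared input/output embedding exposing call(ids, mode="embedding").
    def __init__(self, vocab_size, hidden_size, **kwargs):
        super(ToySharedEmbedding, self).__init__(**kwargs)
        self.word_embeddings = self.add_weight("weight", shape=(vocab_size, hidden_size))

    def call(self, inputs, mode="embedding"):
        if mode == "embedding":
            return tf.gather(self.word_embeddings, inputs)  # token ids -> hidden states
        return tf.matmul(inputs, self.word_embeddings, transpose_b=True)  # hidden states -> logits

wte = ToySharedEmbedding(99, 32)
input_ids = tf.random.uniform((13, 7), maxval=99, dtype=tf.int32)
inputs_embeds = wte(input_ids, mode="embedding")  # the first thing _get_embeds tries
print(inputs_embeds.shape)  # (13, 7, 32)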
...
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow
from transformers import T5Config, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
@require_tf
class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
is_encoder_decoder = True
all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
class TFT5ModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
n_positions=14,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
d_ff=37,
relative_attention_num_buckets=8,
dropout_rate=0.1,
initializer_factor=0.002,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_positions = n_positions
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.d_ff = d_ff
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.initializer_factor = initializer_factor
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_labels = None
if self.use_labels:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = T5Config(
vocab_size_or_config_json_file=self.vocab_size,
n_positions=self.n_positions,
d_model=self.hidden_size,
d_ff=self.d_ff,
d_kv=self.hidden_size // self.num_attention_heads,
num_layers=self.num_hidden_layers,
num_heads=self.num_attention_heads,
relative_attention_num_buckets=self.relative_attention_num_buckets,
dropout_rate=self.dropout_rate,
initializer_factor=self.initializer_factor)
return (config, input_ids, input_mask, token_labels)
def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
model = TFT5Model(config=config)
inputs = {'encoder_input_ids': input_ids,
'decoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
encoder_output, decoder_output = model(inputs)
encoder_output, decoder_output = model(input_ids,
decoder_attention_mask=input_mask,
encoder_input_ids=input_ids)
result = {
"encoder_output": encoder_output.numpy(),
"decoder_output": decoder_output.numpy(),
}
self.parent.assertListEqual(
list(result["encoder_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(
list(result["decoder_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
model = TFT5WithLMHeadModel(config=config)
inputs = {'encoder_input_ids': input_ids,
'decoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
prediction_scores, decoder_output = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, token_labels) = config_and_inputs
inputs_dict = {'encoder_input_ids': input_ids,
'decoder_input_ids': input_ids,
'decoder_attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_t5_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_model(*config_and_inputs)
def test_with_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in ['t5-small']:
model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
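For reference, the snippet below is a minimal stand-alone sketch of the dictionary-style call exercised by TFT5ModelTester.create_and_check_t5_model above; it assumes TensorFlow 2 plus the TFT5 classes added on this branch, and reuses the tester defaults (batch_size=13, seq_length=7, d_model=32).

import tensorflow as tf
from transformers import T5Config
from transformers.modeling_tf_t5 import TFT5Model

config = T5Config(vocab_size_or_config_json_file=99, n_positions=14, d_model=32,
                  d_ff=37, d_kv=32 // 4, num_layers=5, num_heads=4,
                  relative_attention_num_buckets=8, dropout_rate=0.1,
                  initializer_factor=0.002)
model = TFT5Model(config=config)

input_ids = tf.random.uniform((13, 7), maxval=99, dtype=tf.int32)
attention_mask = tf.ones((13, 7), dtype=tf.int32)
inputs = {'encoder_input_ids': input_ids,
          'decoder_input_ids': input_ids,
          'decoder_attention_mask': attention_mask}
encoder_output, decoder_output = model(inputs)
print(encoder_output.shape, decoder_output.shape)  # (13, 7, 32) (13, 7, 32)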
@@ -67,7 +67,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
...
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
...
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]

 if __name__ == '__main__':
     unittest.main()