Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
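This commit re-formats the code base with the 2020 release of the black formatter (the exact pinned version is not visible in these hunks). Two changes account for almost every hunk below: the space after a docstring's opening triple quotes is dropped, and a call or signature that ends with a trailing comma is exploded to one argument per line (black 20's "magic trailing comma" behavior). A condensed before/after, taken from the RelPartialLearnableDecoderLayer hunk further down:

    # Before: wrapped onto one argument line, but written with a trailing comma.
    attn_outputs = self.dec_attn(
        dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask, output_attentions=output_attentions,
    )

    # After: the trailing comma forces one argument per line.
    attn_outputs = self.dec_attn(
        dec_inp,
        r,
        attn_mask=dec_attn_mask,
        mems=mems,
        head_mask=head_mask,
        output_attentions=output_attentions,
    )

    # Docstrings: '""" Gaussian Error Linear Unit.'  ->  '"""Gaussian Error Linear Unit.'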
......@@ -83,7 +83,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
def gelu(x):
""" Gaussian Error Linear Unit.
"""Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
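For reference, a runnable sketch of the two variants this docstring contrasts. The tanh form is the OpenAI GPT approximation quoted above; the erf form is the exact GELU originally used in the Google BERT repo (stated here from the standard definition, not taken from this diff).

    import math
    import torch

    def gelu_exact(x):
        # Exact GELU: x * Phi(x), the form originally used in Google's BERT repo
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

    def gelu_gpt(x):
        # OpenAI GPT's tanh approximation, the formula quoted in the docstring above
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    x = torch.linspace(-3.0, 3.0, steps=7)
    print((gelu_exact(x) - gelu_gpt(x)).abs().max())  # small but non-zero difference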
......@@ -333,9 +333,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
......@@ -516,8 +516,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
class TFXLMPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
......@@ -858,7 +858,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
"""Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
......
......@@ -77,7 +77,8 @@ class TFXLMRobertaModel(TFRobertaModel):
@add_start_docstrings(
"""XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
"""XLM-RoBERTa Model with a `language modeling` head on top. """,
XLM_ROBERTA_START_DOCSTRING,
)
class TFXLMRobertaForMaskedLM(TFRobertaForMaskedLM):
"""
......
......@@ -62,9 +62,9 @@ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def gelu(x):
""" Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
......@@ -807,8 +807,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
class TFXLNetPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
......@@ -1213,33 +1213,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
Return:
Examples::
Examples::
import tensorflow as tf
import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
import tensorflow as tf
import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
......@@ -1401,7 +1401,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
"""Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
......
......@@ -45,8 +45,8 @@ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_to_pytorch_map(model, config):
""" A map of modules from TF to PyTorch.
This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
"""A map of modules from TF to PyTorch.
This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
"""
tf_to_pt_map = {}
......@@ -112,8 +112,7 @@ def build_tf_to_pytorch_map(model, config):
def load_tf_weights_in_transfo_xl(model, config, tf_path):
""" Load tf checkpoints in a pytorch model
"""
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
......@@ -386,7 +385,12 @@ class RelPartialLearnableDecoderLayer(nn.Module):
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
attn_outputs = self.dec_attn(
dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask, output_attentions=output_attentions,
dec_inp,
r,
attn_mask=dec_attn_mask,
mems=mems,
head_mask=head_mask,
output_attentions=output_attentions,
)
ff_output = self.pos_ff(attn_outputs[0])
......@@ -456,8 +460,8 @@ class AdaptiveEmbedding(nn.Module):
class TransfoXLPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = TransfoXLConfig
......@@ -474,8 +478,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
nn.init.constant_(bias, 0.0)
def _init_weights(self, m):
""" Initialize the weights.
"""
"""Initialize the weights."""
classname = m.__class__.__name__
if classname.find("Linear") != -1:
if hasattr(m, "weight") and m.weight is not None:
......@@ -515,7 +518,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
self._init_bias(m.r_bias)
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
"""Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
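The hunk truncates the docstring here; as a hedged aside on the documented behavior (checkpoint and token names are placeholders), the typical call grows the input embedding matrix to match the tokenizer:

    from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

    tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
    model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

    tokenizer.add_tokens(["<new_token>"])          # placeholder token
    model.resize_token_embeddings(len(tokenizer))  # resizes the input embeddings and re-ties output weights if tie_weights() exists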
......@@ -948,7 +951,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
last_hidden_state=core_out,
mems=new_mems,
hidden_states=hids,
attentions=attentions,
)
......@@ -1064,8 +1070,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
)
def get_output_embeddings(self):
""" Double-check if you are using adaptive softmax.
"""
"""Double-check if you are using adaptive softmax."""
if self.sample_softmax > 0:
return self.out_layer
else:
......
......@@ -85,17 +85,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
def forward(self, hidden, labels=None, keep_order=False):
"""
Params:
hidden :: [len*bsz x d_proj]
labels :: [len*bsz]
Return:
if labels is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [(len-1)*bsz] Negative log likelihood
We could replace this implementation by the native PyTorch one
if theirs had an option to set bias on all clusters in the native one.
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
Params:
hidden :: [len*bsz x d_proj]
labels :: [len*bsz]
Return:
if labels is None:
out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
else:
out :: [(len-1)*bsz] Negative log likelihood
We could replace this implementation by the native PyTorch one
if theirs had an option to set bias on all clusters in the native one.
here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
"""
if labels is not None:
......@@ -191,7 +191,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
return out
def log_prob(self, hidden):
r""" Computes log probabilities for all :math:`n\_classes`
r"""Computes log probabilities for all :math:`n\_classes`
From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
Args:
hidden (Tensor): a minibatch of examples
......
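The forward docstring above notes that this class could be swapped for PyTorch's native adaptive softmax if the native module allowed a bias on every cluster. For comparison, a small sketch of that native module (the sizes and cutoffs here are illustrative, not the model's):

    import torch
    import torch.nn as nn

    # Native module referenced by the docstring; head_bias only adds a bias to the
    # head cluster, which is exactly the limitation mentioned above.
    asm = nn.AdaptiveLogSoftmaxWithLoss(in_features=64, n_classes=1000, cutoffs=[100, 500], head_bias=True)

    hidden = torch.randn(32, 64)            # [len*bsz x d_proj] in the docstring's notation
    labels = torch.randint(0, 1000, (32,))

    output, loss = asm(hidden, labels)      # per-example target log-probs and mean NLL
    log_probs = asm.log_prob(hidden)        # [32, 1000], like the labels-is-None branch above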
......@@ -51,8 +51,7 @@ try:
except ImportError:
# Older PyTorch compatibility
class Identity(nn.Module):
r"""A placeholder identity operator that is argument-insensitive.
"""
r"""A placeholder identity operator that is argument-insensitive."""
def __init__(self, *args, **kwargs):
super().__init__()
......@@ -488,8 +487,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
)
def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
""" Tie or clone module weights depending of whether we are using TorchScript or not
"""
"""Tie or clone module weights depending of whether we are using TorchScript or not"""
if self.config.torchscript:
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
......@@ -498,7 +496,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
if getattr(output_embeddings, "bias", None) is not None:
output_embeddings.bias.data = torch.nn.functional.pad(
output_embeddings.bias.data,
(0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],),
(
0,
output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
),
"constant",
0,
)
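The hunk above only re-wraps the call; for clarity, on a 1-D bias `torch.nn.functional.pad` with a `(0, n)` pad spec appends `n` zeros on the right, so the bias keeps matching the resized output embedding:

    import torch

    bias = torch.tensor([0.1, 0.2, 0.3])     # old output size: 3
    new_rows = 5                             # new output size after resizing
    padded = torch.nn.functional.pad(bias, (0, new_rows - bias.shape[0]), "constant", 0)
    print(padded)  # tensor([0.1000, 0.2000, 0.3000, 0.0000, 0.0000])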
......@@ -906,7 +907,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
def load(module: nn.Module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs,
state_dict,
prefix,
local_metadata,
True,
missing_keys,
unexpected_keys,
error_msgs,
)
for name, child in module._modules.items():
if child is not None:
......@@ -1242,24 +1249,24 @@ class SQuADHead(nn.Module):
return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the first token for the labeled span.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the last token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the first token for the labeled span.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the last token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS).
1.0 means the token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns:
Returns:
"""
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
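`p_mask` above marks positions that cannot hold an answer (1.0 = masked). A hedged sketch of how such a mask is commonly derived for a question+context pair; the token_type convention shown is an assumption, not code from this diff:

    import torch

    # Assume token_type_ids are 0 for question/special tokens, 1 for context tokens,
    # and 0 again for trailing padding (BERT-style pair encoding).
    token_type_ids = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0]])

    p_mask = (token_type_ids == 0).float()   # 1.0 on everything that cannot be an answer
    print(p_mask)  # tensor([[1., 1., 1., 1., 0., 0., 0., 0., 1.]])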
......@@ -1375,7 +1382,7 @@ class SequenceSummary(nn.Module):
self.summary = nn.Linear(config.hidden_size, num_classes)
activation_string = getattr(config, "summary_activation", None)
self.activation: Callable = (get_activation(activation_string) if activation_string else Identity())
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
self.first_dropout = Identity()
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
......@@ -1409,7 +1416,11 @@ class SequenceSummary(nn.Module):
output = hidden_states.mean(dim=1)
elif self.summary_type == "cls_index":
if cls_index is None:
cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,)
cls_index = torch.full_like(
hidden_states[..., :1, :],
hidden_states.shape[-2] - 1,
dtype=torch.long,
)
else:
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
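The `cls_index` branch above expands the index to shape `(..., 1, hidden_size)` so that one `gather` can pull out a single token's hidden state per example. A self-contained sketch of that pattern (illustrative shapes only):

    import torch

    hidden_states = torch.randn(2, 7, 4)                  # (batch, seq_len, hidden)
    cls_index = torch.tensor([6, 3])                      # which token to summarize, per example

    idx = cls_index.unsqueeze(-1).unsqueeze(-1)           # (batch, 1, 1)
    idx = idx.expand(-1, 1, hidden_states.size(-1))       # (batch, 1, hidden)

    summary = hidden_states.gather(-2, idx).squeeze(-2)   # (batch, hidden)
    assert torch.equal(summary[0], hidden_states[0, 6])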
......
......@@ -228,8 +228,8 @@ class TransformerFFN(nn.Module):
class XLMPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
......@@ -462,9 +462,9 @@ class XLMModel(XLMPreTrainedModel):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
......@@ -572,7 +572,11 @@ class XLMModel(XLMPreTrainedModel):
# self attention
attn_outputs = self.attentions[i](
tensor, attn_mask, cache=cache, head_mask=head_mask[i], output_attentions=output_attentions,
tensor,
attn_mask,
cache=cache,
head_mask=head_mask[i],
output_attentions=output_attentions,
)
attn = attn_outputs[0]
if output_attentions:
......@@ -633,8 +637,7 @@ class XLMPredLayer(nn.Module):
)
def forward(self, x, y=None):
""" Compute the loss, and optionally the scores.
"""
"""Compute the loss, and optionally the scores."""
outputs = ()
if self.asm is False:
scores = self.proj(x)
......@@ -969,38 +972,38 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......@@ -1131,7 +1134,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
......
......@@ -68,7 +68,8 @@ class XLMRobertaModel(RobertaModel):
@add_start_docstrings(
"""XLM-RoBERTa Model with a `language modeling` head on top. """, XLM_ROBERTA_START_DOCSTRING,
"""XLM-RoBERTa Model with a `language modeling` head on top. """,
XLM_ROBERTA_START_DOCSTRING,
)
class XLMRobertaForMaskedLM(RobertaForMaskedLM):
"""
......
......@@ -58,9 +58,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
""" A map of modules from TF to PyTorch.
I use a map to keep the PyTorch model as
identical to the original PyTorch model as possible.
"""A map of modules from TF to PyTorch.
I use a map to keep the PyTorch model as
identical to the original PyTorch model as possible.
"""
tf_to_pt_map = {}
......@@ -141,8 +141,7 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
def load_tf_weights_in_xlnet(model, config, tf_path):
""" Load tf checkpoints in a pytorch model
"""
"""Load tf checkpoints in a pytorch model"""
try:
import numpy as np
import tensorflow as tf
......@@ -548,8 +547,8 @@ class XLNetLayer(nn.Module):
class XLNetPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
......@@ -557,8 +556,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
base_model_prefix = "transformer"
def _init_weights(self, module):
""" Initialize the weights.
"""
"""Initialize the weights."""
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
......@@ -1350,46 +1348,46 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
Labels for masked language modeling.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on `target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see the `prepare_inputs_for_generation` fn and examples below)
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored, the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
Examples::
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# In the same way, the XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
assert labels.shape[0] == 1, 'only one word will be predicted'
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
Labels for masked language modeling.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on `target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token has to be added to the `input_ids` (see the `prepare_inputs_for_generation` fn and examples below)
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored, the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Return:
Examples::
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# In the same way, the XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
assert labels.shape[0] == 1, 'only one word will be predicted'
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......@@ -1867,38 +1865,38 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
1.0 means the token should be masked, 0.0 means it is not masked.
Returns:
Example::
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
......
......@@ -122,7 +122,9 @@ def create_optimizer(
)
if num_warmup_steps:
lr_schedule = WarmUp(
initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps,
initial_learning_rate=init_lr,
decay_schedule_fn=lr_schedule,
warmup_steps=num_warmup_steps,
)
if weight_decay_rate > 0.0:
optimizer = AdamWeightDecay(
......
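A hedged sketch of how the `WarmUp` schedule re-wrapped above is typically composed with a decay schedule; the argument names mirror the ones visible in this hunk, while the imports and defaults are assumptions about this version of the library:

    import tensorflow as tf
    from transformers import AdamWeightDecay, WarmUp  # assumed to be exported here

    init_lr, num_train_steps, num_warmup_steps = 5e-5, 10_000, 500

    decay = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=0.0,
    )
    lr_schedule = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=decay,
        warmup_steps=num_warmup_steps,
    )
    optimizer = AdamWeightDecay(learning_rate=lr_schedule, weight_decay_rate=0.01)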
......@@ -208,7 +208,11 @@ class PipelineDataFormat:
SUPPORTED_FORMATS = ["json", "csv", "pipe"]
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite: bool = False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite: bool = False,
):
self.output_path = output_path
self.input_path = input_path
......@@ -261,7 +265,11 @@ class PipelineDataFormat:
@staticmethod
def from_str(
format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
format: str,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
) -> "PipelineDataFormat":
"""
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending
......@@ -305,7 +313,11 @@ class CsvPipelineDataFormat(PipelineDataFormat):
"""
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
super().__init__(output_path, input_path, column, overwrite=overwrite)
......@@ -346,7 +358,11 @@ class JsonPipelineDataFormat(PipelineDataFormat):
"""
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
self,
output_path: Optional[str],
input_path: Optional[str],
column: Optional[str],
overwrite=False,
):
super().__init__(output_path, input_path, column, overwrite=overwrite)
......@@ -610,7 +626,10 @@ class Pipeline(_ScikitCompat):
# Parse arguments
inputs = self._args_parser(*args, **kwargs)
inputs = self.tokenizer(
inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding,
inputs,
add_special_tokens=add_special_tokens,
return_tensors=self.framework,
padding=padding,
)
return inputs
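For context, the call re-wrapped above is the tokenization step every pipeline applies to its inputs; from the user's side it stays behind the usual one-liner (a hedged example: the default model is downloaded on first use and the printed score is only indicative):

    from transformers import pipeline

    nlp = pipeline("sentiment-analysis")   # picks a default model and tokenizer
    print(nlp("Black 20 makes the diffs longer but the style more uniform."))
    # e.g. [{'label': 'POSITIVE', 'score': 0.99}]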
......@@ -1349,7 +1368,10 @@ class TokenClassificationPipeline(Pipeline):
with self.device_placement():
tokens = self.tokenizer(
sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True,
sentence,
return_attention_mask=False,
return_tensors=self.framework,
truncation=True,
)
# Forward
......@@ -1925,7 +1947,9 @@ class SummarizationPipeline(Pipeline):
)
summaries = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
results = []
......@@ -1935,7 +1959,9 @@ class SummarizationPipeline(Pipeline):
record["summary_token_ids"] = summary
if return_text:
record["summary_text"] = self.tokenizer.decode(
summary, skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
summary,
skip_special_tokens=True,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)
results.append(record)
return results
......@@ -2032,7 +2058,9 @@ class TranslationPipeline(Pipeline):
)
translations = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
results = []
for translation in translations:
......@@ -2271,7 +2299,9 @@ class ConversationalPipeline(Pipeline):
"You might consider trimming the early phase of the conversation".format(input_length, max_length)
)
generated_responses = self.model.generate(
inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**generate_kwargs,
)
cleaned_history = self._clean_padding_history(generated_responses)
......@@ -2355,7 +2385,8 @@ class ConversationalPipeline(Pipeline):
max_len = max([len(item) for item in outputs])
outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
outputs = BatchEncoding(
{"input_ids": outputs, "attention_mask": [[1] * len(outputs)]}, tensor_type=self.framework,
{"input_ids": outputs, "attention_mask": [[1] * len(outputs)]},
tensor_type=self.framework,
)
return outputs
......
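The padding logic just above right-pads every encoded conversation to the longest one before building the `BatchEncoding`. In isolation the idiom looks like this (the pad id is a placeholder; the pipeline uses the tokenizer's):

    pad_token_id = 0                                   # placeholder value
    outputs = [[5, 6, 7], [8, 9], [10]]

    max_len = max(len(item) for item in outputs)
    outputs = [o + [pad_token_id] * (max_len - len(o)) for o in outputs]
    print(outputs)  # [[5, 6, 7], [8, 9, 0], [10, 0, 0]]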
......@@ -169,7 +169,7 @@ def assert_screenout(out, what):
class CaptureStd:
""" Context manager to capture:
"""Context manager to capture:
stdout, clean it up and make it available via obj.out
stderr, and make it available via obj.err
......
......@@ -105,31 +105,31 @@ TOKENIZER_MAPPING = OrderedDict(
class AutoTokenizer:
r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
that will be instantiated as one of the tokenizer classes of the library
when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct tokenizer class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `t5`: T5Tokenizer (T5 model)
- `distilbert`: DistilBertTokenizer (DistilBert model)
- `albert`: AlbertTokenizer (ALBERT model)
- `camembert`: CamembertTokenizer (CamemBERT model)
- `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
- `longformer`: LongformerTokenizer (AllenAI Longformer model)
- `roberta`: RobertaTokenizer (RoBERTa model)
- `bert`: BertTokenizer (Bert model)
- `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- `xlnet`: XLNetTokenizer (XLNet model)
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
This class cannot be instantiated using `__init__()` (it throws an error).
that will be instantiated as one of the tokenizer classes of the library
when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct tokenizer class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `t5`: T5Tokenizer (T5 model)
- `distilbert`: DistilBertTokenizer (DistilBert model)
- `albert`: AlbertTokenizer (ALBERT model)
- `camembert`: CamembertTokenizer (CamemBERT model)
- `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
- `longformer`: LongformerTokenizer (AllenAI Longformer model)
- `roberta`: RobertaTokenizer (RoBERTa model)
- `bert`: BertTokenizer (Bert model)
- `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- `xlnet`: XLNetTokenizer (XLNet model)
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
This class cannot be instantiated using `__init__()` (it throws an error).
"""
def __init__(self):
......@@ -140,7 +140,7 @@ class AutoTokenizer:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
r""" Instantiate one of the tokenizer classes of the library
r"""Instantiate one of the tokenizer classes of the library
from a pre-trained model vocabulary.
The tokenizer class to instantiate is selected
......
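A short, hedged usage example of the dispatch this docstring describes (the checkpoint names are the standard public ones; the concrete tokenizer class returned may vary between library versions):

    from transformers import AutoTokenizer

    # Dispatched via config.model_type, or by pattern matching on the name:
    bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # a BERT tokenizer
    gpt2_tok = AutoTokenizer.from_pretrained("gpt2")               # a GPT-2 tokenizer

    print(type(bert_tok).__name__, type(gpt2_tok).__name__)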
......@@ -359,7 +359,7 @@ class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
""" Constructs a BasicTokenizer.
"""Constructs a BasicTokenizer.
Args:
**do_lower_case**: Whether to lower case the input.
......@@ -383,7 +383,7 @@ class BasicTokenizer(object):
self.strip_accents = strip_accents
def tokenize(self, text, never_split=None):
""" Basic Tokenization of a piece of text.
"""Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
Args:
......
......@@ -202,8 +202,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
return word
def _tokenize(self, text):
""" Tokenize a string.
"""
"""Tokenize a string."""
split_tokens = []
words = re.findall(r"\S+\n?", text)
......
......@@ -330,7 +330,11 @@ class CustomDPRReaderTokenizerMixin:
return nbest_spans_predictions[:num_spans]
def _get_best_spans(
self, start_logits: List[int], end_logits: List[int], max_answer_length: int, top_spans: int,
self,
start_logits: List[int],
end_logits: List[int],
max_answer_length: int,
top_spans: int,
) -> List[DPRSpanPrediction]:
"""
Finds the best answer span for the extractive Q&A model for one passage.
......
......@@ -137,9 +137,7 @@ class MarianTokenizer(PreTrainedTokenizer):
padding="longest",
**unused,
) -> BatchEncoding:
"""Prepare model inputs for translation. For best performance, translate one sentence at a time.
"""
"""Prepare model inputs for translation. For best performance, translate one sentence at a time."""
if "" in src_texts:
raise ValueError(f"found empty string in src_texts: {src_texts}")
self.current_spm = self.spm_source
......
......@@ -53,29 +53,29 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class ReformerTokenizer(PreTrainedTokenizer):
"""
Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -142,8 +142,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
......@@ -166,8 +165,8 @@ class ReformerTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -63,34 +63,34 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class T5Tokenizer(PreTrainedTokenizer):
"""
Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`List[str]`, `optional`, defaults to :obj:`100`):
Add a number of extra ids added to the end of the vocabulary for use as sentinels.
These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
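A hedged illustration of the sentinel tokens described above; the checkpoint name is a standard public one and the exact token split is indicative only:

    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # With the default extra_ids=100, "<extra_id_0>" .. "<extra_id_99>" are appended
    # to the vocabulary and survive tokenization as single sentinel tokens.
    ids = tokenizer.encode("The <extra_id_0> walks in <extra_id_1> park")
    print(tokenizer.convert_ids_to_tokens(ids))
    # sentinels come back as single tokens, e.g. ['▁The', '<extra_id_0>', '▁walks', ...]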
......@@ -236,8 +236,7 @@ class T5Tokenizer(PreTrainedTokenizer):
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
......@@ -266,8 +265,8 @@ class T5Tokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -163,7 +163,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def count_sents(self, sents, verbose=False):
"""
sents : a list of sentences, each a list of tokenized symbols
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose:
logger.info("counting {} sents ...".format(len(sents)))
......@@ -496,7 +496,7 @@ class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
data -- LongTensor -- the LongTensor is strictly ordered
"""
self.bsz = bsz
self.bptt = bptt
......@@ -555,7 +555,7 @@ class LMOrderedIterator(object):
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
data -- list[LongTensor] -- there is no order among the LongTensors
"""
self.data = data
......