Commit 9c1bdb5b authored by Rémi Louf

revert renaming of lm_labels to ltr_lm_labels

parent 098a89f3
@@ -283,14 +283,14 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()

     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, ltr_lm_labels = batch
+        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch

         source = source.to(args.device)
         target = target.to(args.device)
         encoder_token_type_ids = encoder_token_type_ids.to(args.device)
         encoder_mask = encoder_mask.to(args.device)
         decoder_mask = decoder_mask.to(args.device)
-        ltr_lm_labels = ltr_lm_labels.to(args.device)
+        lm_labels = lm_labels.to(args.device)

         with torch.no_grad():
             outputs = model(
@@ -299,7 +299,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                 encoder_token_type_ids=encoder_token_type_ids,
                 encoder_attention_mask=encoder_mask,
                 decoder_attention_mask=decoder_mask,
-                decoder_ltr_lm_labels=ltr_lm_labels,
+                decoder_lm_labels=lm_labels,
             )
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
...
@@ -791,7 +791,7 @@ class BertForMaskedLM(BertPreTrainedModel):
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
-        **ltr_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the left-to-right language modeling loss (next word prediction).
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -800,7 +800,7 @@ class BertForMaskedLM(BertPreTrainedModel):
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **masked_lm_loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
-        **ltr_lm_loss**: (`optional`, returned when ``ltr_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **ltr_lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Next token prediction loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -838,7 +838,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                   self.bert.embeddings.word_embeddings)

     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, ltr_lm_labels=None, ):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):

         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -857,19 +857,19 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 1. If a tensor that contains the indices of masked labels is provided,
         #    the cross-entropy is the MLM cross-entropy that measures the likelihood
         #    of predictions for masked words.
-        # 2. If `ltr_lm_labels` is provided we are in a causal scenario where we
+        # 2. If `lm_labels` is provided we are in a causal scenario where we
         #    try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs

-        if ltr_lm_labels is not None:
+        if lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :].contiguous()
-            ltr_lm_labels = ltr_lm_labels[:, 1:].contiguous()
+            lm_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), ltr_lm_labels.view(-1))
+            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
             outputs = (ltr_lm_loss,) + outputs

         return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
...
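The comments in the last hunk describe the two loss modes: a masked-LM cross-entropy when ``masked_lm_labels`` is given, and a left-to-right next-token loss when ``lm_labels`` is given. The standalone sketch below shows why the scores and labels are shifted by one position before the cross-entropy; the random tensors stand in for the output of the language modeling head, and the shapes and the ``-1`` ignore index are illustrative only, not taken from a real checkpoint.

    # Sketch of the shifted next-token loss computed in the `lm_labels` branch.
    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_len, vocab_size = 2, 8, 32
    prediction_scores = torch.randn(batch_size, seq_len, vocab_size)
    lm_labels = torch.randint(0, vocab_size, (batch_size, seq_len))
    lm_labels[:, -2:] = -1  # e.g. padding positions, ignored by the loss

    # Position t predicts token t+1: drop the last score and the first label
    # so that scores[:, t] lines up with labels[:, t+1].
    shifted_scores = prediction_scores[:, :-1, :].contiguous()
    shifted_labels = lm_labels[:, 1:].contiguous()

    loss_fct = CrossEntropyLoss(ignore_index=-1)
    ltr_lm_loss = loss_fct(shifted_scores.view(-1, vocab_size), shifted_labels.view(-1))
    print(ltr_lm_loss.item())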
@@ -30,10 +30,10 @@ logger = logging.getLogger(__name__)
 class PreTrainedSeq2seq(nn.Module):
     r"""
         :class:`~transformers.PreTrainedSeq2seq` is a generic model class that will be
-        instantiated as a Seq2seq model with one of the base model classes of
-        the library as encoder and (optionally) as decoder when created with
-        the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class
-        method.
+        instantiated as a transformer architecture with one of the base model
+        classes of the library as encoder and (optionally) another one as
+        decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
     """

     def __init__(self, encoder, decoder):
@@ -59,13 +59,13 @@ class PreTrainedSeq2seq(nn.Module):
            encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
@@ -103,7 +103,7 @@ class PreTrainedSeq2seq(nn.Module):
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
-                You can specify different kwargs for the decoder by prefixing the key with `decoder_` (e.g. ``decoder_output_attention=True``).
+                You can specify kwargs specific to the encoder and the decoder by prefixing the key with `encoder_` and `decoder_` respectively (e.g. ``decoder_output_attention=True``). The remaining kwargs will be passed to both the encoder and the decoder.

        Examples::

@@ -154,8 +154,11 @@ class PreTrainedSeq2seq(nn.Module):

        return model

    def save_pretrained(self, save_directory):
""" Save a Seq2Seq model and its configuration file in a format """ Save a Seq2Seq model and its configuration file in a format such
such that it can be loaded using `:func:`~transformers.PreTrainedSeq2seq.from_pretrained` """ that it can be loaded using `:func:`~transformers.PreTrainedSeq2seq.from_pretrained`
We save the encoder' and decoder's parameters in two separate directories.
"""
         self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
         self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
@@ -176,6 +179,7 @@ class PreTrainedSeq2seq(nn.Module):
                Indices of encoder input sequence tokens in the vocabulary.
            decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
                Indices of decoder input sequence tokens in the vocabulary.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments.
         """
         # keyword arguments come in 3 flavors: encoder-specific (prefixed by
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
...
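For context on the docstring changes above, here is a hypothetical usage sketch assembled from those docstrings; the import path, the class-method calling convention, and the positional argument order are assumptions, not verified against this branch. It illustrates the `encoder_`/`decoder_` kwarg prefixing and the `encoder/` and `decoder/` subdirectories written by `save_pretrained`.

    # Hypothetical sketch based on the docstrings; names and call signature are assumed.
    from transformers import PreTrainedSeq2seq

    model = PreTrainedSeq2seq.from_pretrained(
        "bert-base-uncased",            # encoder_pretrained_model_name_or_path
        "bert-base-uncased",            # decoder_pretrained_model_name_or_path
        decoder_output_attention=True,  # `decoder_` prefix: forwarded to the decoder only;
                                        # un-prefixed kwargs go to both encoder and decoder
    )

    # Saving writes the two sets of weights to separate subdirectories, matching
    # the ``./my_model_directory/encoder`` and ``./my_model_directory/decoder``
    # path examples in the updated docstring.
    model.save_pretrained("./my_model_directory/")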