Commit 0201d860 authored by thomwolf's avatar thomwolf

added doc for transformer-xl

parent 4cb48945
@@ -382,10 +382,10 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
"""
GPT2_INPUTS_DOCTRING = r""" Inputs:
GPT2_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
@@ -413,7 +413,7 @@ GPT2_INPUTS_DOCTRING = r""" Inputs:
"""
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2Model(GPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
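The outputs documented here are what a plain forward pass of the bare model returns. For reference, a minimal usage sketch (the 'gpt2' pretrained shortcut and the example sentence are only illustrative assumptions):

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2Model

# 'gpt2' is assumed to be an available pretrained shortcut name
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # batch size 1
outputs = model(input_ids)
last_hidden_state = outputs[0]  # (batch_size, sequence_length, hidden_size)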
@@ -538,7 +538,7 @@ class GPT2Model(GPT2PreTrainedModel):
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2LMHeadModel(GPT2PreTrainedModel):
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......
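The ``lm_labels`` argument documented above drives the language modeling loss; since the labels are shifted inside the model, the usual pattern is to reuse the inputs as labels. A hedged sketch (pretrained shortcut name assumed):

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')   # assumed pretrained shortcut
model = GPT2LMHeadModel.from_pretrained('gpt2')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
# labels are shifted inside the model, so lm_labels = input_ids is the usual pattern
outputs = model(input_ids, lm_labels=input_ids)
loss, lm_logits = outputs[:2]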
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
PreTrainedModel, prune_conv1d_layer, SequenceSummary)
PreTrainedModel, prune_conv1d_layer, SequenceSummary,
add_start_docstrings)
from .modeling_bert import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__)
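The newly imported ``add_start_docstrings`` helper from ``modeling_utils`` is what stitches the shared ``*_START_DOCSTRING``/``*_INPUTS_DOCSTRING`` fragments onto each model class. As an illustration only (not the library's exact code), such a decorator can be as small as:

def add_start_docstrings(*docstr):
    # prepend the given docstring fragments to the decorated class or function __doc__
    def docstring_decorator(fn):
        fn.__doc__ = ''.join(docstr) + (fn.__doc__ or '')
        return fn
    return docstring_decorator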
@@ -395,10 +396,10 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
"""
OPENAI_GPT_INPUTS_DOCTRING = r""" Inputs:
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
@@ -422,7 +423,7 @@ OPENAI_GPT_INPUTS_DOCTRING = r""" Inputs:
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -532,7 +533,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING)
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
......
@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter
from .modeling_bert import BertLayerNorm as LayerNorm
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)
logger = logging.getLogger(__name__)
@@ -910,23 +910,71 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
pass
class TransfoXLModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:
- you don't need to specify positioning embeddings indices.
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Args:
config: a TransfoXLConfig class instance with the configuration to build a new model
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
"""
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
previously computed hidden-states to attend to longer context (memory).
This model also uses adaptive softmax inputs and outputs (tied).
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matters related to general usage and behavior.
.. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
https://arxiv.org/abs/1901.02860
.. _`torch.nn.Module`:
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mems**: list of ``torch.FloatTensor`` (one for each layer)
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Transformer-XL Model outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TransfoXLModel(TransfoXLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**: list of ``torch.FloatTensor`` (one for each layer)
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
>>> outputs = model(input_ids)
>>> last_hidden_states, mems = outputs[:2]
"""
def __init__(self, config):
super(TransfoXLModel, self).__init__(config)
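The memory (``mems``) mechanism documented above can be exercised by carrying ``mems`` from one segment to the next. A rough sketch, assuming the 'transfo-xl-wt103' pretrained shortcut and an arbitrary split point:

import torch
from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')

ids = torch.tensor(tokenizer.encode("Hello, my dog is cute. He likes playing in the park.")).unsqueeze(0)
first_segment, second_segment = ids[:, :8], ids[:, 8:]

hidden_1, mems = model(first_segment)[:2]              # no memory on the first segment
hidden_2, mems = model(second_segment, mems=mems)[:2]  # second segment attends to the cached states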
@@ -1193,41 +1241,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
def forward(self, input_ids, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can also be invoked by calling the model instance directly once it has been instantiated.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`mems`: optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of ``(last_hidden_state, new_mems)``.
``last_hidden_state``: the encoded-hidden-states at the top of the model
as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, new_mems)
"""
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous()
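This transposition is why ``mems`` come back length-first while ``input_ids`` stay batch-first, as noted in the docstring above. A quick shape check, reusing the token ids from the removed example (the shapes in the comments restate the docstring, not verified output):

import torch
from pytorch_transformers import TransfoXLModel

model = TransfoXLModel.from_pretrained('transfo-xl-wt103')  # assumed pretrained shortcut
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])    # [batch_size=2, sequence_length=3]

last_hidden_state, new_mems = model(input_ids)[:2]
print(last_hidden_state.shape)  # batch-first: [batch_size, sequence_length, d_model]
print(new_mems[0].shape)        # length-first: [mem_len, batch_size, d_model]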
@@ -1239,27 +1252,45 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
(adaptive softmax with weights tied to the adaptive input embeddings)""",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
- you don't need to specify positioning embeddings indices
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
Args:
config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
We don't output them when the loss is computed to speedup adaptive softmax decoding.
**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Examples::
>>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLLMHeadModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, mems = outputs[:2]
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
"""
def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config)
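To make the loss/score trade-off described in the docstring concrete, here is a hedged usage sketch; the pretrained shortcut is assumed, and the keyword is ``labels``, matching the ``forward`` signature shown further down:

import torch
from pytorch_transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)

prediction_scores, mems = model(input_ids)[:2]  # no labels: adaptive-softmax scores plus memory
loss = model(input_ids, labels=input_ids)[0]    # labels given: the language modeling loss comes first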
@@ -1310,44 +1341,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(data)
def forward(self, input_ids, labels=None, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can also be invoked by calling the model instance directly once it has been instantiated.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the labels token indices selected in the range [0, self.config.n_token[
`mems`: an optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of (last_hidden_state, new_mems)
``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is provided, it is the negative
log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of the
tokens, of shape [batch_size, sequence_length, n_tokens].
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
"""
bsz = input_ids.size(0)
tgt_len = input_ids.size(1)
......
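Both ``forward`` methods above also accept the optional ``head_mask`` described in the inputs docstring. A small sketch of masking the first attention head in every layer (the config attribute names ``n_layer`` and ``n_head`` are assumed from the Transformer-XL configuration):

import torch
from pytorch_transformers import TransfoXLModel

model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])

head_mask = torch.ones(model.config.n_layer, model.config.n_head)  # 1 = keep, 0 = mask
head_mask[:, 0] = 0.0                                              # silence head 0 in every layer
outputs = model(input_ids, head_mask=head_mask)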