chenpangpang / transformers · Commit 0201d860

authored Jul 15, 2019 by thomwolf

    added doc for transformer-xl

parent 4cb48945

Showing 3 changed files with 106 additions and 112 deletions (+106 / -112)
pytorch_transformers/modeling_gpt2.py        +4   -4
pytorch_transformers/modeling_openai.py      +6   -5
pytorch_transformers/modeling_transfo_xl.py  +96  -103
pytorch_transformers/modeling_gpt2.py

@@ -382,10 +382,10 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
         https://pytorch.org/docs/stable/nn.html#module

     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
 """

-GPT2_INPUTS_DOCTRING = r"""    Inputs:
+GPT2_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
...
@@ -413,7 +413,7 @@ GPT2_INPUTS_DOCTRING = r"""    Inputs:
 """

 @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2Model(GPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
...
@@ -538,7 +538,7 @@ class GPT2Model(GPT2PreTrainedModel):

 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """,
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     r"""
     **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
...
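The hunks above only rename GPT2_INPUTS_DOCTRING to GPT2_INPUTS_DOCSTRING and pass it to @add_start_docstrings, which prepends the shared START/INPUTS docstrings onto each model class's own docstring. As a rough sketch of how such a decorator can work (an illustrative approximation, not necessarily the library's exact implementation):

    # Illustrative sketch of a docstring-prepending decorator in the spirit of
    # add_start_docstrings; the real pytorch_transformers helper may differ in detail.
    def add_start_docstrings(*docstr):
        def decorator(obj):
            # Prepend the shared docstrings to whatever docstring `obj` already has.
            obj.__doc__ = "".join(docstr) + (obj.__doc__ or "")
            return obj
        return decorator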
pytorch_transformers/modeling_openai.py

@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter

 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
 from .modeling_bert import BertLayerNorm as LayerNorm

 logger = logging.getLogger(__name__)
...
@@ -395,10 +396,10 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module

     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
 """

-OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
...
@@ -422,7 +423,7 @@ OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
 """

 @add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
-                      OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
...
@@ -532,7 +533,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):

 @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """,
-                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING)
+                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
     **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
...
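Because the decorator only rewrites __doc__, the fix from the GPT-2 variable to OPENAI_GPT_INPUTS_DOCSTRING can be sanity-checked by inspecting the composed class docstring. A small check, assuming the package at this revision is importable:

    from pytorch_transformers import OpenAIGPTModel

    # After this commit, the composed docstring should contain both the shared
    # model description and the Inputs section for OpenAI GPT.
    doc = OpenAIGPTModel.__doc__ or ""
    assert "input_ids" in doc
    print(doc[:300])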
pytorch_transformers/modeling_transfo_xl.py

@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)

 logger = logging.getLogger(__name__)
...
@@ -910,23 +910,71 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
     pass

-class TransfoXLModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
-
-    Transformer XL uses relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices.
-    - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Args:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
-    """
+TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
+    `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
+    by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+    It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
+    previously computed hidden-states to attend to longer context (memory).
+    This model also uses adaptive softmax inputs and outputs (tied).
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
+        https://arxiv.org/abs/1901.02860
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mems**: list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states, mems = outputs[:2]
+    """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
...
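The new TRANSFO_XL_START_DOCSTRING stresses that the model can reuse previously computed hidden states (mems) to attend to a longer context. A sketch of that usage, extending the Examples block above (it assumes the pretrained transfo-xl-wt103 checkpoint and tokenizer are available for download):

    import torch
    from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel

    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
    model.eval()

    # Two consecutive segments of a longer text.
    segment_1 = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
    segment_2 = torch.tensor(tokenizer.encode("and my cat likes it too")).unsqueeze(0)

    with torch.no_grad():
        # First pass returns the hidden states plus the memory to carry forward.
        last_hidden_1, mems = model(segment_1)[:2]
        # Feeding `mems` back lets the second pass attend beyond its own input window.
        last_hidden_2, mems = model(segment_2, mems=mems)[:2]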
@@ -1193,41 +1241,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)

     def forward(self, input_ids, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `mems`: optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of ``(last_hidden_state, new_mems)``.
-            ``last_hidden_state``: the encoded-hidden-states at the top of the model
-                as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
-            ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, new_mems)
-        """
         # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
         # so we transpose here from shape [bsz, len] to shape [len, bsz]
         input_ids = input_ids.transpose(0, 1).contiguous()
...
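One useful detail from the deleted forward() docstring: each entry of mems is laid out as [mem_len, batch_size, d_model], i.e. with the first two dimensions transposed relative to input_ids, which matches the surviving comment about transposing [bsz, len] to [len, bsz] internally. A quick shape check, continuing the sketch above (assumed config attribute names):

    # `mems` is a list with one tensor per layer, each shaped [mem_len, batch_size, d_model].
    assert isinstance(mems, list)
    for layer_mem in mems:
        assert layer_mem.dim() == 3
        assert layer_mem.size(1) == segment_1.size(0)      # batch dimension comes second
        assert layer_mem.size(2) == model.config.d_model   # hidden size comes last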
@@ -1239,27 +1252,45 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)

+@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
+    (adaptive softmax with weights tied to the adaptive input embeddings)""",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
-
-    This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
-
-    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
-
-    Args:
-        config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
-    """
+    r"""
+    **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        Labels for language modeling.
+        Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+        Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+        All labels set to ``-1`` are ignored (masked), the loss is only
+        computed for labels in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+            We don't output them when the loss is computed to speedup adaptive softmax decoding.
+        **mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, mems = outputs[:2]
+    """
     def __init__(self, config):
         super(TransfoXLLMHeadModel, self).__init__(config)
...
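Per the new docstring, providing labels makes the head return a language-modeling loss term and skips the prediction scores to speed up adaptive softmax decoding; since the labels are shifted inside the model, lm_labels = input_ids is a valid choice. Note that the docstring documents **lm_labels** while the forward() signature in this file names the argument labels; the sketch below follows the signature and assumes the pretrained checkpoint:

    import torch
    from pytorch_transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    model.eval()

    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)

    with torch.no_grad():
        # Without labels: the first two outputs are the prediction scores and the memory.
        prediction_scores, mems = model(input_ids)[:2]

        # With labels (shifted inside the model, so reusing input_ids is fine):
        # the first output now carries the language-modeling loss term instead of the scores.
        outputs = model(input_ids, labels=input_ids)
        lm_loss = outputs[0]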
@@ -1310,44 +1341,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         return self.transformer.init_mems(data)

     def forward(self, input_ids, labels=None, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the labels token indices selected in the range [0, self.config.n_token[
-            `mems`: an optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of (last_hidden_state, new_mems)
-            ``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the negative
-                log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of
-                tokens of shape [batch_size, sequence_length, n_tokens].
-            ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
-        """
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
...
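The deleted example at the end of this docstring showed the canonical two-step pattern for the LM head as well: run one chunk, keep new_mems, and pass it back via mems= on the next chunk. Reusing the already-converted token ids from that removed example, a minimal sketch (assuming the pretrained checkpoint):

    import torch
    from pytorch_transformers import TransfoXLLMHeadModel

    model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    model.eval()

    # Token ids taken from the removed docstring example (already converted to ids there).
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])

    with torch.no_grad():
        prediction_scores, new_mems = model(input_ids)[:2]
        # Second pass on the following chunk, attending to the cached memory of the first.
        prediction_scores, new_mems = model(input_ids_next, mems=new_mems)[:2]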