Unverified Commit e4f9dca0 authored by Thomas Wolf, committed by GitHub

Merge pull request #773 from huggingface/doc-sphinx

Sphinx doc, XLM Checkpoints
parents d216e798 b87eb82b
@@ -23,7 +23,8 @@ from io import open
import torch
import numpy
from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
from pytorch_transformers.modeling_xlm import (XLMConfig, XLMModel)
from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
@@ -37,7 +38,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.Tensor, numpy.ndarray)))
vocab = chkpt['dico_word2id']
vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
# Save pytorch-model
pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
...
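For reference, the vocabulary rewrite in the hunk above converts the raw XLM BPE vocabulary into the convention the tokenizer expects: tokens carrying the `@@` continuation marker lose it, while other tokens (beyond the first 14 reserved special entries, hence `i > 13`) get an explicit `</w>` end-of-word suffix. A minimal, self-contained sketch of that transformation on a toy vocabulary (the token strings below are made up for illustration):

```python
# Toy stand-in for chkpt['dico_word2id']; indices 0-13 are XLM's reserved special symbols.
toy_vocab = {"<s>": 0, "</s>": 1, "<pad>": 2, "hel@@": 14, "lo": 15, "world": 16}

converted = dict(
    (s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i)
    for s, i in toy_vocab.items()
)

print(converted)
# {'<s>': 0, '</s>': 1, '<pad>': 2, 'hel': 14, 'lo</w>': 15, 'world</w>': 16}
```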
@@ -177,6 +177,38 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
class TransfoXLConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `TransfoXLModel`.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
cutoffs: cutoffs for the adaptive softmax
d_model: Dimensionality of the model's hidden states.
d_embed: Dimensionality of the embeddings
d_head: Dimensionality of the model's heads.
div_val: divisor value for adaptive input and softmax
pre_lnorm: apply LayerNorm to the input instead of the output
d_inner: Inner dimension in FF
n_layer: Number of hidden layers in the Transformer encoder.
n_head: Number of attention heads for each attention layer in
the Transformer encoder.
tgt_len: number of tokens to predict
ext_len: length of the extended context
mem_len: length of the retained previous states
same_length: use the same attn length for all tokens
proj_share_all_but_first: True to share all but first projs, False not to share.
attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
clamp_len: use the same pos embeddings after clamp_len
sample_softmax: number of samples in sampled softmax
adaptive: use adaptive softmax
tie_weight: tie the word embedding and softmax weights
dropout: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
dropatt: The dropout ratio for the attention probabilities.
untie_r: untie relative position biases
embd_pdrop: The dropout ratio for the embeddings.
init: parameter initializer to use
init_range: parameters initialized by U(-init_range, init_range).
proj_init_std: projection parameters initialized by N(0, proj_init_std)
init_std: parameters initialized by N(0, init_std)
""" """
pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -210,38 +242,6 @@ class TransfoXLConfig(PretrainedConfig):
init_std=0.02,
**kwargs):
"""Constructs TransfoXLConfig.
"""
super(TransfoXLConfig, self).__init__(**kwargs)
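The keyword arguments documented above map directly onto the constructor parameters, so a configuration can be built from the defaults or by overriding a handful of values. A minimal sketch (the specific override values below are illustrative, not recommended settings):

```python
from pytorch_transformers import TransfoXLConfig

# Default configuration (uses the documented default hyper-parameters).
config = TransfoXLConfig()

# Overriding a few of the documented arguments; values here are arbitrary examples.
small_config = TransfoXLConfig(
    d_model=410,   # dimensionality of the hidden states
    d_embed=410,   # dimensionality of the embeddings
    n_head=10,     # attention heads per layer
    n_layer=12,    # number of layers
    mem_len=400,   # length of the retained memory
)
```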
@@ -901,42 +901,20 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
class TransfoXLModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").

Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:

- you don't need to specify positioning embeddings indices.
- the tokens in the vocabulary have to be sorted in decreasing frequency.

Args:
config: a TransfoXLConfig class instance with the configuration to build a new model

Example::

config = TransfoXLConfig()
model = TransfoXLModel(config)
"""
def __init__(self, config):
super(TransfoXLModel, self).__init__(config)
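Beyond building the model from a fresh configuration as in the docstring example above, the library's ``from_pretrained`` mechanism can load a published checkpoint. A short sketch, assuming the ``transfo-xl-wt103`` shortcut name is available in your installed version:

```python
import torch
from pytorch_transformers import TransfoXLModel

# Load pretrained weights (downloaded and cached on first use).
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
model.eval()

# Token ids are assumed to have been produced by the matching tokenizer already.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
with torch.no_grad():
    last_hidden_state, new_mems = model(input_ids)
print(last_hidden_state.shape)  # (batch_size, sequence_length, d_model)
```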
@@ -1200,18 +1178,40 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)

def forward(self, input_ids, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can also be invoked by calling the model instance directly, once it has been instantiated.**

Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`mems`: optional memory of hidden states from previous forward passes,
as a list (num layers) of hidden states at the entry of each layer.
Each hidden state has shape [self.config.mem_len, bsz, self.config.d_model].
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`.

Returns:
A tuple of ``(last_hidden_state, new_mems)``.

``last_hidden_state``: the encoded hidden states at the top of the model,
as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
``new_mems``: list (num layers) of updated mem states at the entry of each layer;
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model].
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``.

Example::

# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])

last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)

# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, new_mems)
"""
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
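The `mems` argument is what lets Transformer-XL read a long document in fixed-size chunks while still attending to earlier chunks. A minimal sketch of that pattern, assuming `model` is an instantiated ``TransfoXLModel`` and `all_ids` stands in for a pre-tokenized document:

```python
import torch

chunk_len = 128                               # illustrative chunk size
all_ids = torch.randint(0, 1000, (1, 512))    # stand-in for a long sequence of token ids

mems = None
hidden_states = []
for start in range(0, all_ids.size(1), chunk_len):
    chunk = all_ids[:, start:start + chunk_len]
    # new mems carry the (transposed) hidden states of this chunk forward to the next call
    last_hidden_state, mems = model(chunk, mems=mems)
    hidden_states.append(last_hidden_state)

full_hidden = torch.cat(hidden_states, dim=1)  # (batch_size, total_length, d_model)
```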
@@ -1227,52 +1227,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").

This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``.
Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:

- you don't need to specify positioning embeddings indices.
- the tokens in the vocabulary have to be sorted in decreasing frequency.

Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.

Args:
config: a ``TransfoXLConfig`` class instance with the configuration to build a new model

Example::

config = TransfoXLConfig()
model = TransfoXLLMHeadModel(config)
"""
def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config)
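As a complement to the constructor example above, a hedged sketch of loading the pretrained LM-head model and scoring a continuation. It assumes the ``transfo-xl-wt103`` shortcut name is available; without ``labels`` the head returns log-probabilities over the vocabulary (see the ``forward`` docstring below):

```python
import torch
from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model.eval()

input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(
    tokenizer.tokenize("The cat sat on the"))])
with torch.no_grad():
    log_probs, new_mems = model(input_ids)           # no labels -> log-probabilities
next_token_id = log_probs[0, -1].argmax().item()      # greedy next-token prediction
print(tokenizer.convert_ids_to_tokens([next_token_id]))
```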
@@ -1290,7 +1262,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
self.tie_weights()

def tie_weights(self):
"""
Run this to be sure output and input (adaptive) softmax weights are tied.
"""
# sampled softmax
if self.sample_softmax > 0:
if self.config.tie_weight:
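A short sketch of the situation the docstring above warns about: if the transformer's weights are replaced after construction, the (adaptive) softmax head should be re-tied. The ``new_state_dict`` below is a hypothetical checkpoint produced elsewhere:

```python
# model is assumed to be an already-constructed TransfoXLLMHeadModel
model.transformer.load_state_dict(new_state_dict)  # hypothetical manual weight update
model.tie_weights()  # re-tie input embeddings and (adaptive) softmax weights
```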
@@ -1314,18 +1288,43 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(data)

def forward(self, input_ids, labels=None, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can also be invoked by calling the model instance directly, once it has been instantiated.**

Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the label token indices selected in the range [0, self.config.n_token[
`mems`: an optional memory of hidden states from previous forward passes,
as a list (num layers) of hidden states at the entry of each layer.
Each hidden state has shape [self.config.mem_len, bsz, self.config.d_model].
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`.

Returns:
A tuple of ``(softmax_output, new_mems)``.

``softmax_output``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the log
probabilities of the tokens, of shape [batch_size, sequence_length, n_tokens]. Otherwise, it is the
negative log likelihood of the labels, of shape [batch_size, sequence_length].
``new_mems``: list (num layers) of updated mem states at the entry of each layer;
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model].
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and ``labels``.

Example::

# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])

softmax_output, new_mems = model(input_ids)
# or
softmax_output, new_mems = model.forward(input_ids)

# Another time on input_ids_next using the memory:
softmax_output, new_mems = model(input_ids_next, mems=new_mems)
"""
bsz = input_ids.size(0)
tgt_len = input_ids.size(1)
...
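To illustrate the ``labels`` branch described above, a minimal training-step sketch. It assumes ``model`` is a ``TransfoXLLMHeadModel``; when ``labels`` are supplied, the first returned tensor is the per-token negative log likelihood, which can be averaged into a loss:

```python
import torch

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])   # BPE token ids
labels = torch.LongTensor([[51, 99, 12], [5, 0, 7]])       # next-token targets (illustrative ids)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # illustrative optimizer/learning rate

neg_log_likelihood, new_mems = model(input_ids, labels=labels)
loss = neg_log_likelihood.mean()   # average the [batch_size, sequence_length] NLL into a scalar
loss.backward()
optimizer.step()
optimizer.zero_grad()
```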
@@ -36,10 +36,24 @@ PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
},
'merges_file':
{
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
},
}
@@ -77,10 +91,15 @@ def text_standardize(text):
class XLMTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer for XLM, adapted from the OpenAI BPE tokenizer. Peculiarities:

- lower-cases all inputs
- uses `SpaCy tokenizer <https://spacy.io/api/tokenizer/>`_ and \
`ftfy <https://ftfy.readthedocs.io/en/latest/>`_ for pre-BPE tokenization if they are installed, \
falling back to BERT's BasicTokenizer if not.
- the argument ``special_tokens`` and the function ``set_special_tokens`` can be used to add additional symbols \
(ex: "__classify__") to the vocabulary.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
...
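The vocabulary and merges URLs added above mean the additional XLM checkpoints can be loaded by shortcut name. A brief sketch, assuming the ``xlm-mlm-enfr-1024`` files are reachable from your environment:

```python
from pytorch_transformers import XLMTokenizer

# Downloads and caches the vocab.json / merges.txt listed in PRETRAINED_VOCAB_FILES_MAP
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')

tokens = tokenizer.tokenize("Hello, world!")   # inputs are lower-cased by the tokenizer
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, ids)
```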
@@ -52,7 +52,8 @@ SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:

- requires `SentencePiece <https://github.com/google/sentencepiece>`_
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
...
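For completeness, a hedged usage sketch of the SentencePiece-backed tokenizer. It assumes the ``sentencepiece`` package is installed and that the ``xlnet-base-cased`` shortcut exists in your installed version:

```python
from pytorch_transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')  # needs the sentencepiece package
pieces = tokenizer.tokenize("Hello, world!")                    # SentencePiece sub-word pieces
ids = tokenizer.convert_tokens_to_ids(pieces)
print(pieces, ids)
```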