Unverified Commit a6f25118 authored by Thomas Wolf, committed by GitHub

Merge pull request #694 from huggingface/release_0.6.3

Release 0.6.3
parents 80684f6f 4447f270
@@ -309,6 +309,28 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == '.</w>'
```
And how to use `OpenAIGPTDoubleHeadsModel`
```python
# Load pre-trained model (weights)
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
model.eval()
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
# Pad the shorter choice so both sequences have the same length and can be stacked in one tensor
# (the padding index is arbitrary here since mc_token_ids points at the last real token)
max_len = max(len(indexed_tokens1), len(indexed_tokens2))
indexed_tokens1 = indexed_tokens1 + [0] * (max_len - len(indexed_tokens1))
indexed_tokens2 = indexed_tokens2 + [0] * (max_len - len(indexed_tokens2))
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Predict the language modeling logits and the multiple choice logits
with torch.no_grad():
    lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
```
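The multiple choice head assigns one score per candidate ending. Assuming the snippet above has just been run, the preferred candidate can be read off with a simple argmax (a minimal illustration):
```python
# multiple_choice_logits has shape [batch_size, num_choices]; 0 selects text1, 1 selects text2
predicted_choice = torch.argmax(multiple_choice_logits[0]).item()
```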
### Transformer-XL
Here is a quick-start example using `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLLMHeadModel` classes with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes.
@@ -456,6 +478,29 @@ predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])
```
And how to use `GPT2DoubleHeadsModel`
```python
# Load pre-trained model (weights)
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
model.eval()
# Prepare tokenized input
text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
tokenized_text1 = tokenizer.tokenize(text1)
tokenized_text2 = tokenizer.tokenize(text2)
indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
# Pad the shorter choice so both sequences have the same length and can be stacked in one tensor
max_len = max(len(indexed_tokens1), len(indexed_tokens2))
indexed_tokens1 = indexed_tokens1 + [0] * (max_len - len(indexed_tokens1))
indexed_tokens2 = indexed_tokens2 + [0] * (max_len - len(indexed_tokens2))
tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Predict the language modeling logits, the multiple choice logits and the cached key/values (`past`)
with torch.no_grad():
    lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
```
## Doc
Here is a detailed documentation of the classes in the package and how to use them:
@@ -474,7 +519,7 @@ Here is a detailed documentation of the classes in the package and how to use th
To load one of Google AI's or OpenAI's pre-trained models, or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as
```python
model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *inputs, **kwargs)
```
where
@@ -492,9 +537,12 @@ where
- `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
- `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
- `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
- `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
- `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
- `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
- `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
- `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
- `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
- a path or url to a pretrained model archive containing:
@@ -502,7 +550,12 @@ where
- `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information; a short example follows this list).
- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
- `state_dict`: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
- `*inputs`, `**kwargs`: additional inputs for the specific Bert class (e.g. num_labels for BertForSequenceClassification)
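Putting these options together, here is a minimal, illustrative sketch (the shortcut name, cache directory and `num_labels` value are only examples) of loading a model while caching the weights in a per-process folder, as suggested above:
```python
from pytorch_pretrained_bert import BertForSequenceClassification

# Cache the downloaded weights in a rank-specific folder to avoid concurrent access
# during distributed training, and forward an extra argument to the model class.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    cache_dir='./pretrained_model_0',  # e.g. './pretrained_model_{}'.format(args.local_rank)
    num_labels=2,                      # forwarded to BertForSequenceClassification via **kwargs
)
```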
`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
@@ -628,6 +681,13 @@ These configuration classes contains a few utilities to load and save configurat
`BertModel` is the basic BERT Transformer model with a layer of summed token, position and sequence embeddings followed by a series of identical self-attention blocks (12 for BERT-base, 24 for BERT-large).
Instantiation:
The model can be instantiated with the following arguments:
- `config`: a `BertConfig` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@@ -636,6 +696,7 @@ We detail them here. This model takes as *inputs*:
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
- `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
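As an illustration of the `head_mask` argument, here is a minimal, hypothetical sketch (model name, sentence and masked head are only examples) that follows the convention above and masks a single attention head of a `bert-base` model:
```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

tokens = tokenizer.tokenize("[CLS] Jim Henson was a puppeteer [SEP]")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# One row per layer and one column per head; 1.0 masks a head, 0.0 leaves it untouched
head_mask = torch.zeros(12, 12)
head_mask[0, 3] = 1.0  # mask head 3 of the first layer

with torch.no_grad():
    encoded_layers, pooled_output = model(input_ids, head_mask=head_mask)
```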
This model *outputs* a tuple composed of:
@@ -753,6 +814,13 @@ where total_tokens_embeddings can be obtained as config.total_tokens_embeddings
`total_tokens_embeddings = config.vocab_size + config.n_special`
You should use the associated indices to index the embeddings.
Instantiation:
The model can be instantiated with the following arguments:
- `config`: an `OpenAIGPTConfig` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@@ -763,9 +831,10 @@ We detail them here. This model takes as *inputs*:
- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
This model *outputs*:
- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimension of input_ids)
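Since the model now returns one hidden-state tensor per layer (plus the embedding output), the top-layer activations are the last element of that list. A brief, hypothetical sketch (shortcut name and sentence are only examples):
```python
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

tokens = tokenizer.tokenize("Jim Henson was a puppeteer")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    all_hidden_states = model(input_ids)

# len(all_hidden_states) == config.n_layer + 1; the last entry is the top of the model
top_hidden_states = all_hidden_states[-1]
```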
#### 10. `OpenAIGPTLMHeadModel`
@@ -845,6 +914,13 @@ all_hidden_states = lower_hidden_states + [hidden_states]
`GPT2Model` is the OpenAI GPT-2 Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
Instantiation:
The model can be instantiated with the following arguments:
- `config`: a `GPT2Config` class instance with the configuration to build a new model.
- `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
- `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False
The inputs and output are **identical to the TensorFlow model inputs and outputs**.
We detail them here. This model takes as *inputs*:
@@ -856,9 +932,10 @@ We detail them here. This model takes as *inputs*:
You can use it to add a third type of embedding to each input token in the sequence
(the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
- `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states (key and values in the attention blocks) to speed up sequential decoding (this is the `presents` output of the model, cf. below).
- `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
This model *outputs*:
- `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimension of input_ids)
- `presents`: a list of pre-computed hidden-states (key and values in each attention block) as torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example and the sketch below).
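To illustrate how the cached `presents` can be fed back in as `past`, here is a minimal, hypothetical greedy-decoding sketch in the spirit of `run_gpt2.py` (the prompt and the number of steps are only examples):
```python
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

generated = tokenizer.encode("Who was Jim Henson ?")
context = torch.tensor([generated])
past = None  # no cached key/values yet

with torch.no_grad():
    for _ in range(5):
        logits, past = model(context, past=past)
        next_token = torch.argmax(logits[0, -1, :]).item()
        generated.append(next_token)
        # Thanks to `past`, only the newly generated token has to be fed back in
        context = torch.tensor([[next_token]])

print(tokenizer.decode(generated))
```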
#### 15. `GPT2LMHeadModel`
...
@@ -23,6 +23,9 @@ bert_docstring = """
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
. `bert-base-german-cased`
. `bert-large-uncased-whole-word-masking`
. `bert-large-cased-whole-word-masking`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
@@ -81,6 +84,7 @@ def bertTokenizer(*args, **kwargs):
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
Example:
>>> import torch
>>> sentence = 'Hello, World!'
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
>>> toks = tokenizer.tokenize(sentence)
@@ -101,6 +105,7 @@ def bertModel(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -129,6 +134,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -158,6 +164,7 @@ def bertForPreTraining(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -181,6 +188,7 @@ def bertForMaskedLM(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -222,6 +230,7 @@ def bertForSequenceClassification(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -256,6 +265,7 @@ def bertForMultipleChoice(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -288,6 +298,7 @@ def bertForQuestionAnswering(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
@@ -326,6 +337,7 @@ def bertForTokenClassification(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
...
@@ -11,7 +11,7 @@ gpt2_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `gpt2`, `gpt2-medium`
- a path or url to a pretrained model archive containing:
. `gpt2_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
@@ -147,10 +147,14 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
# Prepare tokenized input
>>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
>>> tokenized_text1 = tokenizer.tokenize(text1)
>>> tokenized_text2 = tokenizer.tokenize(text2)
>>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
>>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
>>> # Pad the shorter choice so both sequences have the same length
>>> max_len = max(len(indexed_tokens1), len(indexed_tokens2))
>>> indexed_tokens1 = indexed_tokens1 + [0] * (max_len - len(indexed_tokens1))
>>> indexed_tokens2 = indexed_tokens2 + [0] * (max_len - len(indexed_tokens2))
>>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
>>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load gpt2DoubleHeadsModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
...
@@ -126,7 +126,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
@@ -161,15 +161,18 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
Example:
# Load the tokenizer
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
# Prepare tokenized input
>>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
>>> tokenized_text1 = tokenizer.tokenize(text1)
>>> tokenized_text2 = tokenizer.tokenize(text2)
>>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
>>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
>>> # Pad the shorter choice so both sequences have the same length
>>> max_len = max(len(indexed_tokens1), len(indexed_tokens2))
>>> indexed_tokens1 = indexed_tokens1 + [0] * (max_len - len(indexed_tokens1))
>>> indexed_tokens2 = indexed_tokens2 + [0] * (max_len - len(indexed_tokens2))
>>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
>>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
# Load openAIGPTDoubleHeadsModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
...
@@ -36,6 +36,7 @@ from torch.nn.parameter import Parameter
from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
from .modeling import BertLayerNorm as LayerNorm
from .modeling_gpt2 import prune_conv1d_layer
logger = logging.getLogger(__name__)
@@ -256,7 +257,7 @@ class Conv1D(nn.Module):
class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
@@ -265,13 +266,31 @@ class Attention(nn.Module):
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.output_attentions = output_attentions
self.keep_multihead_output = keep_multihead_output
self.multihead_output = None
self.c_attn = Conv1D(n_state * 3, 1, nx)
self.c_proj = Conv1D(n_state, 1, nx)
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.resid_dropout = nn.Dropout(config.resid_pdrop)
def prune_heads(self, heads):
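# Build a mask over the flattened per-head dimensions: heads listed in `heads` are
# zeroed out, and only the remaining dimensions are kept when shrinking c_attn/c_proj.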
mask = torch.ones(self.n_head, self.split_size // self.n_head)
for head in heads:
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
# Prune conv1d layers
self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
# Update hyper params
self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
self.n_head = self.n_head - len(heads)
def _attn(self, q, k, v, head_mask=None):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
@@ -282,6 +301,11 @@ class Attention(nn.Module):
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
# Mask heads if we want to
if head_mask is not None:
w = w * head_mask
if self.output_attentions:
return w, torch.matmul(w, v)
return torch.matmul(w, v)
@@ -299,13 +323,18 @@ class Attention(nn.Module):
else:
return x.permute(0, 2, 1, 3)
def forward(self, x, head_mask=None):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
a = self._attn(query, key, value, head_mask)
if self.keep_multihead_output:
self.multihead_output = a
self.multihead_output.retain_grad()
if self.output_attentions:
attentions, a = a
a = self.merge_heads(a)
@@ -332,17 +361,17 @@ class MLP(nn.Module):
class Block(nn.Module):
def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
super(Block, self).__init__()
nx = config.n_embd
self.output_attentions = output_attentions
self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
self.mlp = MLP(4 * nx, config)
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
def forward(self, x, head_mask=None):
a = self.attn(x, head_mask=head_mask)
if self.output_attentions:
attentions, a = a
n = self.ln_1(x + a)
@@ -472,14 +501,19 @@ class OpenAIGPTPreTrainedModel(nn.Module):
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
archive_file, config_file
)
)
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
@@ -579,7 +613,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
You should use the associate indices to index the embeddings.
Params:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@@ -591,10 +628,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
(the previous two being the word and position embeddings).
The input, position and token_type embeddings are summed inside the Transformer before the first
self-attention block.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
(or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
Example usage:
@@ -609,13 +648,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
```
"""
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTModel, self).__init__(config)
self.output_attentions = output_attentions
self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop)
block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
self.apply(self.init_weights)
@@ -634,7 +674,20 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
# Copy word embeddings from the previous weights
self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
def prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.h[layer].attn.prune_heads(heads)
def get_multihead_outputs(self):
""" Gather all multi-head outputs.
Return: list (layers) of multihead module outputs with gradients
"""
return [h.attn.multihead_output for h in self.h]
def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
if position_ids is None:
# This was used when we had a single embedding matrice from position and token embeddings
# start = self.config.vocab_size + self.config.n_special
@@ -643,6 +696,21 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicates we mask the head
# attention_probs has shape bsz x n_heads x N x N
# head_mask has shape n_layer x batch x n_heads x N x N
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
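# Invert the mask: heads marked with 1.0 (to be masked) become a 0.0 multiplier on the attention weights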
head_mask = (1.0 - head_mask)
else:
head_mask = [None] * self.config.n_layer
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
@@ -657,17 +725,22 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states)
output_shape = input_shape + (hidden_states.size(-1),)
all_attentions = []
all_hidden_states = [hidden_states.view(*output_shape)]
for i, block in enumerate(self.h):
outputs = block(hidden_states, head_mask[i])
if self.output_attentions:
attentions, hidden_states = outputs
all_attentions.append(attentions)
else:
hidden_states = outputs
all_hidden_states.append(hidden_states.view(*output_shape))
if self.output_attentions:
return all_attentions, all_hidden_states
return all_hidden_states
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
@@ -691,7 +764,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
You should use the associate indices to index the embeddings.
Params:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@@ -706,6 +782,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` is not `None`:
@@ -726,9 +804,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
```
"""
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTLMHeadModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.apply(self.init_weights)
@@ -740,10 +819,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states = hidden_states
hidden_states = hidden_states[-1]
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
# Shift so that tokens < n predict n
@@ -780,7 +861,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
You should use the associate indices to index the embeddings.
Params:
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
@@ -799,6 +883,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
is only computed for the labels set in [0, ..., total_tokens_embeddings]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
@@ -820,9 +906,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
```
"""
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
self.apply(self.init_weights)
@@ -835,10 +922,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
position_ids=None, head_mask=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
if self.transformer.output_attentions:
all_attentions, hidden_states = hidden_states
hidden_states = hidden_states[-1]
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
losses = []
...
@@ -926,14 +926,19 @@ class TransfoXLPreTrainedModel(nn.Module):
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
archive_file, config_file))
return None
if resolved_archive_file == archive_file and resolved_config_file == config_file:
logger.info("loading weights file {}".format(archive_file))
...
@@ -35,6 +35,8 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
@@ -45,6 +47,8 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
'bert-base-german-cased': 512,
'bert-large-uncased-whole-word-masking': 512,
'bert-large-cased-whole-word-masking': 512,
}
VOCAB_NAME = 'vocab.txt'
@@ -177,13 +181,18 @@ class BertTokenizer(object):
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
logger.error(
"Couldn't reach server at '{}' to download vocabulary.".format(
vocab_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None return None
if resolved_vocab_file == vocab_file: if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file)) logger.info("loading vocabulary file {}".format(vocab_file))
......
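The two whole-word-masking vocabularies registered above are standard BERT WordPiece files, so the usual loading API applies. A minimal sketch, assuming the matching model weights are published under the same shortcut names as the other entries in the archive map (the weights entry itself is not shown in this hunk):

```python
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

# Tokenizer name comes from PRETRAINED_VOCAB_ARCHIVE_MAP above; the weights
# name is assumed to follow the same convention in the model archive map.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
model = BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking')
model.eval()
```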
...@@ -113,14 +113,19 @@ class GPT2Tokenizer(object):
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
                logger.error(
                    "Couldn't reach server at '{}' to download vocabulary.".format(
                        vocab_file))
            else:
                logger.error(
                    "Model name '{}' was not found in model name list ({}). "
                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
                    "at this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                        pretrained_model_name_or_path,
                        vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
...
...@@ -101,14 +101,19 @@ class OpenAIGPTTokenizer(object):
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
                logger.error(
                    "Couldn't reach server at '{}' to download vocabulary.".format(
                        vocab_file))
            else:
                logger.error(
                    "Model name '{}' was not found in model name list ({}). "
                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
                    "at this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                        pretrained_model_name_or_path,
                        vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
...
...@@ -71,14 +71,19 @@ class TransfoXLTokenizer(object):
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
                logger.error(
                    "Couldn't reach server at '{}' to download vocabulary.".format(
                        vocab_file))
            else:
                logger.error(
                    "Model name '{}' was not found in model name list ({}). "
                    "We assumed '{}' was a path or url but couldn't find files {} "
                    "at this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                        pretrained_model_name_or_path,
                        vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
...
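All four tokenizers now follow the same pattern as the model classes: a "couldn't reach server" message for a known shortcut name, the "unknown name or path" message otherwise, and a `None` return in either case. A short sketch of the corresponding check in calling code, using the GPT-2 tokenizer as an example:

```python
from pytorch_pretrained_bert import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer is None:
    # None signals a download or path failure; the reason was already logged.
    raise RuntimeError("Failed to load the GPT-2 vocabulary and merges files.")
```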
...@@ -115,8 +115,9 @@ class GPT2ModelTest(unittest.TestCase):
            return outputs

        def check_gpt2_model_output(self, result):
            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
            self.parent.assertListEqual(
                list(result["hidden_states"][0].size()),
                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
...@@ -209,6 +210,98 @@ class GPT2ModelTest(unittest.TestCase):
                [list(l.size()) for l in result["loss"]],
                [[], []])

        def create_and_check_gpt2_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
                                                  mc_labels, lm_labels, mc_token_ids):
            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
                model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device)
                head_mask[0, 1:-1] = 1.0  # Mask all but the first and last heads on the first layer
                head_mask[-1, 1:] = 1.0  # Mask all but the first head on the last layer
                if isinstance(model, GPT2DoubleHeadsModel):
                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
                else:
                    output = model(input_ids, head_mask=head_mask)

                if isinstance(model, GPT2Model):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output[:-1])
                output = output.sum()
                output.backward()

                multihead_outputs = (model if isinstance(model, GPT2Model) else model.transformer).get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)

                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[1].nonzero()),
                    multihead_outputs[1].numel())

                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)

        def create_and_check_gpt2_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
                                                   mc_labels, lm_labels, mc_token_ids):
            for model_class in (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel):
                model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                transformer = model if isinstance(model, GPT2Model) else model.transformer
                heads_to_prune = {0: list(range(1, self.n_head)),
                                  -1: [0]}
                transformer.prune_heads(heads_to_prune)

                if isinstance(model, GPT2DoubleHeadsModel):
                    output = model(input_ids, mc_token_ids)
                else:
                    output = model(input_ids)

                if isinstance(model, GPT2Model):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output[:-1])
                output = output.sum()
                output.backward()

                multihead_outputs = transformer.get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size * self.n_choices, 1,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size * self.n_choices, self.n_head-1,
                     self.seq_length, self.n_embd // self.n_head])

    def test_default(self):
        self.run_tester(GPT2ModelTest.GPT2ModelTester(self))
...@@ -247,6 +340,9 @@ class GPT2ModelTest(unittest.TestCase):
        tester.check_gpt2_double_heads_output(output_result)
        tester.check_gpt2_double_heads_loss_output(output_result)
        tester.create_and_check_gpt2_for_headmasking(*config_and_inputs)
        tester.create_and_check_gpt2_for_head_pruning(*config_and_inputs)

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
        """Creates a random int32 tensor of the shape within the vocab size."""
...
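The new tests above exercise two features of this release: per-head attention masking via a `head_mask` forward argument and permanent head removal via `prune_heads`. Below is a minimal sketch of head masking on the pretrained GPT-2 language model, following the convention used in the tests (a `head_mask` of shape `[n_layer, n_head]` where an entry of 1.0 masks that head and 0.0 leaves it active). Treat it as an illustration of the test setup rather than documented API; the input text and the choice of masked heads are arbitrary.

```python
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog")
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

# One row per layer, one column per head; 1.0 masks a head (as in the tests above).
head_mask = torch.zeros(model.config.n_layer, model.config.n_head)
head_mask[0, 1:] = 1.0  # on the first layer, keep only head 0

with torch.no_grad():
    outputs = model(tokens_tensor, head_mask=head_mask)
```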
...@@ -125,8 +125,9 @@ class OpenAIGPTModelTest(unittest.TestCase):
            return outputs

        def check_openai_model_output(self, result):
            self.parent.assertEqual(len(result["hidden_states"]), self.n_layer + 1)
            self.parent.assertListEqual(
                list(result["hidden_states"][0].size()),
                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
...@@ -182,6 +183,99 @@ class OpenAIGPTModelTest(unittest.TestCase):
                [list(l.size()) for l in result["loss"]],
                [[], []])

        def create_and_check_openai_for_headmasking(self, config, input_ids, token_type_ids, position_ids,
                                                    mc_labels, lm_labels, mc_token_ids):
            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
                model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                head_mask = torch.zeros(self.n_layer, self.n_head).to(input_ids.device)
                head_mask[0, 1:-1] = 1.0  # Mask all but the first and last heads on the first layer
                head_mask[-1, 1:] = 1.0  # Mask all but the first head on the last layer
                if isinstance(model, OpenAIGPTDoubleHeadsModel):
                    output = model(input_ids, mc_token_ids, head_mask=head_mask)
                else:
                    output = model(input_ids, head_mask=head_mask)

                if isinstance(model, OpenAIGPTModel):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output)
                output = output.sum()
                output.backward()

                multihead_outputs = (model if isinstance(model, OpenAIGPTModel) else model.transformer).get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 1:(self.n_head-1), :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, self.n_head-1, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)

                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[1].nonzero()),
                    multihead_outputs[1].numel())

                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
                    self.batch_size * self.n_choices * self.seq_length * self.n_embd // self.n_head)

        def create_and_check_openai_for_head_pruning(self, config, input_ids, token_type_ids, position_ids,
                                                     mc_labels, lm_labels, mc_token_ids):
            for model_class in (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel):
                model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                transformer = model if isinstance(model, OpenAIGPTModel) else model.transformer
                heads_to_prune = {0: list(range(1, self.n_head)),
                                  -1: [0]}
                transformer.prune_heads(heads_to_prune)

                if isinstance(model, OpenAIGPTDoubleHeadsModel):
                    output = model(input_ids, mc_token_ids)
                else:
                    output = model(input_ids)

                if isinstance(model, OpenAIGPTModel):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output)
                output = output.sum()
                output.backward()

                multihead_outputs = transformer.get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.n_layer)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size * self.n_choices, 1,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size * self.n_choices, self.n_head,
                     self.seq_length, self.n_embd // self.n_head])
                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size * self.n_choices, self.n_head-1,
                     self.seq_length, self.n_embd // self.n_head])

    def test_default(self):
        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
...@@ -220,6 +314,9 @@ class OpenAIGPTModelTest(unittest.TestCase):
        tester.check_openai_double_heads_output(output_result)
        tester.check_openai_double_heads_loss_output(output_result)
        tester.create_and_check_openai_for_headmasking(*config_and_inputs)
        tester.create_and_check_openai_for_head_pruning(*config_and_inputs)

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
        """Creates a random int32 tensor of the shape within the vocab size."""
...
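Head pruning, exercised by the tests above, permanently removes selected attention heads from the transformer instead of masking them at run time, so subsequent forward passes run with fewer parameters. A minimal sketch on the pretrained OpenAI GPT model, mirroring the dictionary format used in the tests (`{layer index: list of head indices to remove}`, with negative layer indices counting from the end); consider it illustrative rather than documented API, and note the text and pruned heads are arbitrary.

```python
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

# {layer index: heads to remove}; -1 addresses the last layer, as in the tests.
model.transformer.prune_heads({0: [1, 2, 3], -1: [0]})

tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog")
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    outputs = model(tokens_tensor)  # runs with the remaining heads only
```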
...@@ -293,6 +293,107 @@ class BertModelTest(unittest.TestCase):
                [self.batch_size, self.num_attention_heads, self.seq_length, self.seq_length])

        def create_and_check_bert_for_headmasking(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
                                BertForTokenClassification):
                if model_class in [BertForSequenceClassification,
                                   BertForTokenClassification]:
                    model = model_class(config=config,
                                        num_labels=self.num_labels,
                                        keep_multihead_output=True)
                else:
                    model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                head_mask = torch.zeros(self.num_hidden_layers, self.num_attention_heads).to(input_ids.device)
                head_mask[0, 1:-1] = 1.0  # Mask all but the first and last heads on the first layer
                head_mask[-1, 1:] = 1.0  # Mask all but the first head on the last layer
                output = model(input_ids, token_type_ids, input_mask, head_mask=head_mask)

                if isinstance(model, BertModel):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output)
                output = output.sum()
                output.backward()

                multihead_outputs = (model if isinstance(model, BertModel) else model.bert).get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size, self.num_attention_heads,
                     self.seq_length, self.hidden_size // self.num_attention_heads])
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 1:(self.num_attention_heads-1), :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, 0, :, :].nonzero()),
                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)
                self.parent.assertEqual(
                    len(multihead_outputs[0][:, self.num_attention_heads-1, :, :].nonzero()),
                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)

                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size, self.num_attention_heads,
                     self.seq_length, self.hidden_size // self.num_attention_heads])
                self.parent.assertEqual(
                    len(multihead_outputs[1].nonzero()),
                    multihead_outputs[1].numel())

                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size, self.num_attention_heads,
                     self.seq_length, self.hidden_size // self.num_attention_heads])
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 1:, :, :].nonzero()),
                    0)
                self.parent.assertEqual(
                    len(multihead_outputs[-1][:, 0, :, :].nonzero()),
                    self.batch_size * self.seq_length * self.hidden_size // self.num_attention_heads)

        def create_and_check_bert_for_head_pruning(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            for model_class in (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
                                BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
                                BertForTokenClassification):
                if model_class in [BertForSequenceClassification,
                                   BertForTokenClassification]:
                    model = model_class(config=config,
                                        num_labels=self.num_labels,
                                        keep_multihead_output=True)
                else:
                    model = model_class(config=config, keep_multihead_output=True)
                model.eval()
                bert_model = model if isinstance(model, BertModel) else model.bert
                heads_to_prune = {0: list(range(1, self.num_attention_heads)),
                                  -1: [0]}
                bert_model.prune_heads(heads_to_prune)

                output = model(input_ids, token_type_ids, input_mask)

                if isinstance(model, BertModel):
                    output = sum(t.sum() for t in output[0])
                elif isinstance(output, (list, tuple)):
                    output = sum(t.sum() for t in output)
                output = output.sum()
                output.backward()

                multihead_outputs = bert_model.get_multihead_outputs()

                self.parent.assertEqual(len(multihead_outputs), self.num_hidden_layers)
                self.parent.assertListEqual(
                    list(multihead_outputs[0].size()),
                    [self.batch_size, 1,
                     self.seq_length, self.hidden_size // self.num_attention_heads])
                self.parent.assertListEqual(
                    list(multihead_outputs[1].size()),
                    [self.batch_size, self.num_attention_heads,
                     self.seq_length, self.hidden_size // self.num_attention_heads])
                self.parent.assertListEqual(
                    list(multihead_outputs[-1].size()),
                    [self.batch_size, self.num_attention_heads-1,
                     self.seq_length, self.hidden_size // self.num_attention_heads])

    def test_default(self):
        self.run_tester(BertModelTest.BertModelTester(self))
...@@ -352,6 +453,8 @@ class BertModelTest(unittest.TestCase):
        tester.check_loss_output(output_result)
        tester.create_and_check_bert_for_attentions(*config_and_inputs)
        tester.create_and_check_bert_for_headmasking(*config_and_inputs)
        tester.create_and_check_bert_for_head_pruning(*config_and_inputs)

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
...
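For BERT, the same two hooks are tested: `head_mask` on the forward pass and `prune_heads` on the base `BertModel` (reached through `model.bert` for the task-specific heads). When a model is built with `keep_multihead_output=True`, the per-head context layers can be retrieved afterwards with `get_multihead_outputs()`, which is what the assertions above check. A hedged sketch on the pretrained uncased base model; the pass-through of `keep_multihead_output` via `from_pretrained` is assumed to behave like the other constructor keyword arguments, and the masked heads are chosen arbitrarily.

```python
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', keep_multihead_output=True)
model.eval()

tokens = tokenizer.tokenize("[CLS] the quick brown fox [SEP]")
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

config = model.config
head_mask = torch.zeros(config.num_hidden_layers, config.num_attention_heads)
head_mask[-1, 1:] = 1.0  # on the last layer, keep only head 0 (1.0 masks, as in the tests)

with torch.no_grad():
    encoded_layers, pooled_output = model(tokens_tensor, head_mask=head_mask)

# One tensor per layer, shaped [batch, num_heads, seq_len, head_size];
# masked heads come back as all zeros, as asserted in the tests above.
multihead_outputs = model.get_multihead_outputs()
print(len(multihead_outputs), multihead_outputs[-1].shape)
```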