chenpangpang/transformers

Commit 72863735, authored Sep 09, 2019 by thomwolf
WIP GPT2
parent 34f28b2a

Showing 5 changed files with 507 additions and 426 deletions (+507 -426)
pytorch_transformers/modeling_tf_bert.py              +12  -107
pytorch_transformers/modeling_tf_gpt2.py              +162 -310
pytorch_transformers/modeling_tf_utils.py             +106 -4
pytorch_transformers/tests/modeling_gpt2_test.py      +11  -5
pytorch_transformers/tests/modeling_tf_gpt2_test.py   +216 -0
pytorch_transformers/modeling_tf_bert.py

@@ -704,20 +704,7 @@ class TFBertModel(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForPreTraining(TFBertPreTrainedModel):
     r"""
-        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
-        **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)``
@@ -762,15 +749,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForMaskedLM(TFBertPreTrainedModel):
     r"""
-        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
-            in ``[0, ..., config.vocab_size]``
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -786,8 +765,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
+        outputs = model(input_ids)
+        prediction_scores = outputs[:2]
     """
     def __init__(self, config, *inputs, **kwargs):
@@ -811,12 +790,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     r"""
-        **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
-            Indices should be in ``[0, 1]``.
-            ``0`` indicates sequence B is a continuation of sequence A,
-            ``1`` indicates sequence B is a random sequence.
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Next sequence prediction (classification) loss.
@@ -862,15 +835,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForSequenceClassification(TFBertPreTrainedModel):
     r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
         **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
             Classification (or regression if config.num_labels==1) scores (before SoftMax).
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -886,8 +851,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
+        outputs = model(input_ids)
         loss, logits = outputs[:2]
     """
@@ -905,6 +869,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         pooled_output = outputs[1]
+        if training:
             pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
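An alternative to the explicit `if training:` gate added above is Keras' own `training` argument on `Dropout`, which makes the layer a no-op at inference. This is only an illustrative sketch of that idiom; the class and names below are hypothetical, not part of this file:

    import tensorflow as tf

    class ClassifierHead(tf.keras.layers.Layer):
        def __init__(self, num_labels, dropout_rate, **kwargs):
            super(ClassifierHead, self).__init__(**kwargs)
            self.dropout = tf.keras.layers.Dropout(dropout_rate)
            self.classifier = tf.keras.layers.Dense(num_labels)

        def call(self, pooled_output, training=False):
            # Dropout is only applied when training=True; no explicit branch needed.
            pooled_output = self.dropout(pooled_output, training=training)
            return self.classifier(pooled_output)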
@@ -915,53 +880,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
                           the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-                      BERT_START_DOCSTRING)
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForMultipleChoice(TFBertPreTrainedModel):
     r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-            (a) For sequence pairs:
-                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-            (b) For single sequences:
-                ``tokens: [CLS] the dog is hairy . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification loss.
         **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above).
             Classification scores (before SoftMax).
@@ -979,8 +901,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
         choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
         input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
+        outputs = model(input_ids)
         loss, classification_scores = outputs[:2]
     """
@@ -1025,6 +946,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         pooled_output = outputs[1]
+        if training:
             pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
         reshaped_logits = tf.reshape(logits, (-1, num_choices))
@@ -1039,13 +961,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForTokenClassification(TFBertPreTrainedModel):
     r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for computing the token classification loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification loss.
         **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
             Classification scores (before SoftMax).
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -1061,8 +977,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = BertForTokenClassification.from_pretrained('bert-base-uncased')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
+        outputs = model(input_ids)
         loss, scores = outputs[:2]
     """
@@ -1080,6 +995,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         sequence_output = outputs[0]
+        if training:
             sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
@@ -1093,18 +1009,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
                       BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     r"""
-        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
-        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`).
-            Position outside of the sequence are not taken into account for computing the loss.
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
         **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
             Span-start scores (before SoftMax).
         **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
pytorch_transformers/modeling_tf_gpt2.py

@@ -28,13 +28,13 @@ from io import open
 import numpy as np
 import tensorflow as tf

-from .modeling_tf_utils import TFPreTrainedModel, TFConv1D
+from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)

-GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
+TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
@@ -139,7 +139,7 @@ class TFAttention(tf.keras.layers.Layer):
     @staticmethod
     @tf.function
-    def attention_mask(nd, ns, dtype):
+    def causal_attention_mask(nd, ns, dtype):
         """1's in the lower triangle, counting from the lower right corner.
         Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
         """
@@ -150,20 +150,24 @@ class TFAttention(tf.keras.layers.Layer):
     @tf.function
     def _attn(self, inputs, training=False):
-        q, k, v, head_mask = inputs
+        q, k, v, attention_mask, head_mask = inputs
         # q, k, v have shape [batch, heads, sequence, features]
         w = tf.matmul(q, k, transpose_b=True)
         if self.scale:
-            n_state = shape_list(v)[-1]
-            w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))
+            dk = tf.cast(tf.shape(k)[-1], tf.float32)  # scale attention_scores
+            w = w / tf.math.sqrt(dk)

         # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
         _, _, nd, ns = shape_list(w)
-        b = self.attention_mask(nd, ns, dtype=w.dtype)
+        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
         b = tf.reshape(b, [1, 1, nd, ns])
         w = w * b - 1e4 * (1 - b)

-        w = tf.nn.softmax(w)
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
+        w = tf.nn.softmax(w, axis=-1)
         if training:
             w = self.attn_dropout(w)
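The padding mask added to `w` above is expected to already be additive (0 for kept positions, a large negative number for masked ones), which is how the main layer prepares it further down in this file. A small numeric sketch of the effect:

    import tensorflow as tf

    pad_mask = tf.constant([[1.0, 1.0, 0.0]])          # 1 = attend, 0 = padding
    additive = (1.0 - pad_mask) * -10000.0             # [[0., 0., -10000.]]

    scores = tf.constant([[2.0, 1.0, 3.0]])
    probs = tf.nn.softmax(scores + additive, axis=-1)  # padded position gets ~0 weight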
@@ -179,20 +183,20 @@ class TFAttention(tf.keras.layers.Layer):
     @tf.function
     def merge_heads(self, x):
         x = tf.transpose(x, [0, 2, 1, 3])
-        x_shape = tf.shape(x)
-        new_x_shape = x_shape[:-2] + (x_shape[-2] * x_shape[-1],)
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
         return tf.reshape(x, new_x_shape)

     @tf.function
     def split_heads(self, x):
-        x_shape = tf.shape(x)
-        new_x_shape = x_shape[:-1] + (self.n_head, x_shape[-1] // self.n_head)
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
         x = tf.reshape(x, new_x_shape)
         return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

     @tf.function
     def call(self, inputs, training=False):
-        x, layer_past, head_mask = inputs
+        x, layer_past, attention_mask, head_mask = inputs
         x = self.c_attn(x)
         query, key, value = tf.split(x, 3, axis=2)
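The shape bookkeeping in `split_heads`/`merge_heads` above reduces to a reshape plus a transpose in each direction. A standalone sketch with hard-coded shapes (the layer itself derives them with `shape_list`):

    import tensorflow as tf

    batch, seq, n_head, head_dim = 2, 5, 4, 8
    x = tf.random.normal((batch, seq, n_head * head_dim))

    # split_heads: (batch, seq, hidden) -> (batch, head, seq, head_dim)
    x = tf.reshape(x, [batch, seq, n_head, head_dim])
    x = tf.transpose(x, (0, 2, 1, 3))                     # (2, 4, 5, 8)

    # merge_heads reverses the two steps: (batch, head, seq, head_dim) -> (batch, seq, hidden)
    x = tf.transpose(x, [0, 2, 1, 3])
    x = tf.reshape(x, [batch, seq, n_head * head_dim])    # (2, 5, 32)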
@@ -205,7 +209,7 @@ class TFAttention(tf.keras.layers.Layer):
             value = tf.concat([past_value, value], axis=-2)
         present = tf.stack([key, value], axis=1)

-        attn_outputs = self._attn(query, key, value, head_mask, training=training)
+        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
         a = attn_outputs[0]

         a = self.merge_heads(a)
@@ -217,7 +221,7 @@ class TFAttention(tf.keras.layers.Layer):
         return outputs  # a, present, (attentions)


-class TFMLP(nn.Module):
+class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
@@ -245,15 +249,16 @@ class TFBlock(tf.keras.layers.Layer):
         self.mlp = TFMLP(4 * nx, config, name='mlp')

     @tf.function
-    def call(self, x, layer_past=None, head_mask=None, training=False):
-        output_attn = self.attn(self.ln_1(x),
-                                layer_past=layer_past, head_mask=head_mask,
-                                training=training)
-        a = output_attn[0]  # output_attn: a, present, (attentions)
+    def call(self, inputs, training=False):
+        x, layer_past, attention_mask, head_mask = inputs
+
+        a = self.ln_1(x)
+        output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training)
+        a = output_attn[0]  # output_attn: a, present, (attentions)
         x = x + a

-        m = self.mlp(self.ln_2(x), training=training)
+        m = self.ln_2(x)
+        m = self.mlp(m, training=training)
         x = x + m

         outputs = [x] + output_attn[1:]
@@ -274,13 +279,13 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
         """
         self.weight = self.add_weight(
             "weight",
-            shape=[self.vocab_size, self.n_embed],
+            shape=[self.vocab_size, self.hidden_size],
             initializer=tf.random_normal_initializer(
-                mean=0., stddev=self.n_embed**-0.5))
-        super(TFBertEmbeddings, self).build(input_shape)
+                mean=0., stddev=self.hidden_size**-0.5))
+        super(TFGPT2Embeddings, self).build(input_shape)

     @tf.function
-    def call(self, inputs, mode="embedding", training=False):
+    def call(self, inputs, mode="embedding"):
         """Get token embeddings of inputs.
         Args:
             inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
@@ -296,7 +301,7 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
         if mode == "embedding":
-            return self._embedding(inputs, training=training)
+            return self._embedding(inputs)
         elif mode == "linear":
             return self._linear(inputs)
         else:
@@ -313,10 +318,10 @@ class TFGPT2Embeddings(tf.keras.layers.Layer):
         Returns:
             float32 tensor with shape [batch_size, length, vocab_size].
         """
-        batch_size = tf.shape(inputs)[0]
-        length = tf.shape(inputs)[1]
-        x = tf.reshape(inputs, [-1, self.n_embed])
+        batch_size = shape_list(inputs)[0]
+        length = shape_list(inputs)[1]
+        x = tf.reshape(inputs, [-1, self.hidden_size])
         logits = tf.matmul(x, self.weight, transpose_b=True)

         return tf.reshape(logits, [batch_size, length, self.vocab_size])
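`shape_list`, newly imported from `modeling_tf_utils` (a file also touched by this commit but not shown here), is used so the result is a plain Python list mixing static and dynamic dimensions. A plausible sketch of such a helper, assuming the usual static-plus-dynamic pattern:

    import tensorflow as tf

    def shape_list_sketch(x):
        # Keep known static dimensions as Python ints and fall back to
        # the dynamic tf.shape(x) entries where the static shape is None.
        static = x.shape.as_list()
        dynamic = tf.shape(x)
        return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]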
@@ -326,13 +331,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
+        self.num_hidden_layers = config.n_layer
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd

         self.wte = TFGPT2Embeddings(config, name='wte')
         self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True name='h_{}'.format(i)) for i in range(config.n_layer)]
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_{}'.format(i)) for i in range(config.n_layer)]
         self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')

     def _resize_token_embeddings(self, new_num_tokens):
@@ -346,20 +352,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
     @tf.function
     def call(self, inputs, training=False):
-        input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
         if not isinstance(inputs, (dict, tuple, list)):
             input_ids = inputs
-            attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
+            past, attention_mask, token_type_ids, position_ids, head_mask = None, None, None, None, None
         elif isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else None
-            token_type_ids = inputs[2] if len(inputs) > 2 else None
-            position_ids = inputs[3] if len(inputs) > 3 else None
-            head_mask = inputs[4] if len(inputs) > 4 else None
-            assert len(inputs) <= 5, "Too many inputs."
+            past = inputs[1] if len(inputs) > 1 else None
+            attention_mask = inputs[2] if len(inputs) > 2 else None
+            token_type_ids = inputs[3] if len(inputs) > 3 else None
+            position_ids = inputs[4] if len(inputs) > 4 else None
+            head_mask = inputs[5] if len(inputs) > 5 else None
+            assert len(inputs) <= 6, "Too many inputs."
         else:
             input_ids = inputs.get('input_ids')
+            past = inputs.get('past', None)
             attention_mask = inputs.get('attention_mask', None)
             token_type_ids = inputs.get('token_type_ids', None)
             position_ids = inputs.get('position_ids', None)
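The branching above lets the Keras layer accept a bare tensor, a positional list, or a dict under the single `inputs` argument. A self-contained sketch of the same unpacking, trimmed to three fields for brevity (the helper name is hypothetical):

    import tensorflow as tf

    def unpack_inputs(inputs):
        # Mirrors the branching above: bare tensor, positional list, or dict.
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else None
            attention_mask = inputs[2] if len(inputs) > 2 else None
            return input_ids, past, attention_mask
        if isinstance(inputs, dict):
            return (inputs.get('input_ids'),
                    inputs.get('past', None),
                    inputs.get('attention_mask', None))
        return inputs, None, None

    ids = tf.constant([[1, 2, 3]])
    mask = tf.constant([[1, 1, 0]])
    assert unpack_inputs(ids)[0] is ids
    assert unpack_inputs([ids, None, mask])[2] is mask
    assert unpack_inputs({'input_ids': ids, 'attention_mask': mask})[2] is mask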
@@ -370,49 +376,66 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             past_length = 0
             past = [None] * len(self.h)
         else:
-            past_length = past[0][0].size(-2)
+            past_length = shape_list(past[0][0])[-2]
         if position_ids is None:
-            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
-            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+            position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
+
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = tf.cast(attention_mask, tf.float32)
+            attention_mask = (1.0 - attention_mask) * -10000.0
+        else:
+            attention_mask = None

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if not head_mask is None:
+            raise NotImplementedError
         else:
-            head_mask = [None] * self.config.n_layer
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)

-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
+        input_shape = shape_list(input_ids)
+        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

-        inputs_embeds = self.wte(input_ids)
+        inputs_embeds = self.wte(input_ids, mode='embedding')
         position_embeds = self.wpe(position_ids)
         if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.wte(token_type_ids)
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.wte(token_type_ids, mode='embedding')
         else:
             token_type_embeds = 0
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        if training:
             hidden_states = self.drop(hidden_states)

-        output_shape = input_shape + (hidden_states.size(-1),)
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]

         presents = ()
         all_attentions = []
         all_hidden_states = ()
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
             if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

-            outputs = block(hidden_states, layer_past, head_mask[i])
+            outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
             hidden_states, present = outputs[:2]
             presents = presents + (present,)
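When `past` carries cached keys and values, only the new tokens are fed in and their position ids continue from `past_length`, as computed above. A tiny sketch of that offset:

    import tensorflow as tf

    past_length = 4                                   # 4 positions already cached
    new_input_ids = tf.constant([[50, 51]])           # 2 freshly generated tokens
    position_ids = tf.range(past_length,
                            tf.shape(new_input_ids)[-1] + past_length,
                            dtype=tf.int32)[tf.newaxis, :]
    print(position_ids)                               # [[4 5]]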
@@ -421,7 +444,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         hidden_states = self.ln_f(hidden_states)

-        hidden_states = hidden_states.view(*output_shape)
+        hidden_states = tf.reshape(hidden_states, output_shape)
         # Add last hidden state
         if self.output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
@@ -431,18 +454,19 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
-            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
             outputs = outputs + (all_attentions,)
         return outputs  # last hidden state, presents, (all hidden_states), (attentions)


 class TFGPT2PreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = GPT2Config
-    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_gpt2
+    pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_pt_weights = load_gpt2_pt_weights_in_tf
     base_model_prefix = "transformer"
@@ -487,17 +511,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **past**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -526,7 +554,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
     Examples::

         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
+        model = TFGPT2Model.from_pretrained('gpt2')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
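The example above still builds inputs with `torch.tensor`. A TF-native equivalent is sketched below; the import path and the availability of loadable pretrained TF weights at this WIP stage are assumptions:

    import tensorflow as tf
    from pytorch_transformers import GPT2Tokenizer
    from pytorch_transformers.modeling_tf_gpt2 import TFGPT2Model   # assumed import path

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = TFGPT2Model.from_pretrained('gpt2')                     # assumes TF weights exist
    input_ids = tf.constant([tokenizer.encode("Hello, my dog is cute")])  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]   # last hidden-state is the first element of the output tuple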
@@ -534,149 +562,19 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
-        self.output_hidden_states = config.output_hidden_states
-        self.output_attentions = config.output_attentions
-        self.vocab_size = config.vocab_size
-        self.n_embd = config.n_embd
-
-        self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
-        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
-        self.h = [TFBlock(config.n_ctx, config, scale=True name='h_{}'.format(i)) for i in range(config.n_layer)]
-        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
-
-        self.init_weights()
-
-    def build(self, input_shape):
-        """Build shared word embedding layer
-        Shared weights logic adapted from
-            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
-        """
-        with tf.name_scope("wte"):
-            # Create and initialize weights. The random normal initializer was chosen
-            # arbitrarily, and works well.
-            self.wte = self.add_weight(
-                "weight",
-                shape=[self.vocab_size, self.n_embed],
-                initializer=tf.random_normal_initializer(
-                    mean=0., stddev=self.n_embed**-0.5))
-        super(TFGPT2Model, self).build(input_shape)
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        raise NotImplementedError
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        raise NotImplementedError
+        self.transformer = TFGPT2MainLayer(config, name='transformer')

     @tf.function
     def call(self, inputs, training=False):
-        input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
-        if not isinstance(inputs, (dict, tuple, list)):
-            input_ids = inputs
-            attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
-        elif isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else None
-            token_type_ids = inputs[2] if len(inputs) > 2 else None
-            position_ids = inputs[3] if len(inputs) > 3 else None
-            head_mask = inputs[4] if len(inputs) > 4 else None
-            assert len(inputs) <= 5, "Too many inputs."
-        else:
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', None)
-            token_type_ids = inputs.get('token_type_ids', None)
-            position_ids = inputs.get('position_ids', None)
-            head_mask = inputs.get('head_mask', None)
-            assert len(inputs) <= 5, "Too many inputs."
-
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            past_length = past[0][0].size(-2)
-        if position_ids is None:
-            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
-            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
-
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.wte(token_type_ids)
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        presents = ()
-        all_attentions = []
-        all_hidden_states = ()
-        for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
-            outputs = block(hidden_states, layer_past, head_mask[i])
-            hidden_states, present = outputs[:2]
-            presents = presents + (present,)
-            if self.output_attentions:
-                all_attentions.append(outputs[2])
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = hidden_states.view(*output_shape)
-        # Add last hidden state
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states, presents)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            # let the number of heads free (-1) so we can extract attention even after head pruning
-            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
-            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+        outputs = self.transformer(inputs, training=training)
+        return outputs


 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
-class GPT2LMHeadModel(GPT2PreTrainedModel):
+class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Language modeling loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **past**:
@@ -700,93 +598,38 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         model = GPT2LMHeadModel.from_pretrained('gpt2')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
+        outputs = model(input_ids)
+        logits = outputs[:2]

     """
-    def __init__(self, config):
-        super(GPT2LMHeadModel, self).__init__(config)
-        self.transformer = GPT2Model(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-
-        self.init_weights()
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Make sure we are sharing the input and output embeddings.
-            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
-        """
-        self._tie_or_clone_weights(self.lm_head,
-                                   self.transformer.wte)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name='transformer')

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    @tf.function
+    def call(self, inputs, training=False):
+        transformer_outputs = self.transformer(inputs, training=training)
         hidden_states = transformer_outputs[0]

-        lm_logits = self.lm_head(hidden_states)
+        lm_logits = self.transformer.wte(hidden_states, mode="linear")

         outputs = (lm_logits,) + transformer_outputs[1:]
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            outputs = (loss,) + outputs

-        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
+        return outputs  # lm_logits, presents, (all hidden_states), (attentions)
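The `mode="linear"` call above reuses the token embedding matrix as the output projection, which is how the TF version ties the LM head to the input embeddings without a separate `lm_head`. A toy sketch of the idea, with made-up sizes:

    import tensorflow as tf

    vocab_size, hidden_size = 10, 4
    weight = tf.random.normal((vocab_size, hidden_size))   # single shared matrix

    token_ids = tf.constant([[1, 2, 3]])
    embeddings = tf.gather(weight, token_ids)              # "embedding" mode: (1, 3, hidden)

    hidden_states = embeddings                             # stand-in for the transformer output
    x = tf.reshape(hidden_states, [-1, hidden_size])
    lm_logits = tf.reshape(tf.matmul(x, weight, transpose_b=True),
                           (1, 3, vocab_size))             # "linear" mode: tied projection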
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING)
-class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
+""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
+    r"""
+        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-        **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
-            Labels for computing the multiple choice classification loss.
-            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
-            of the input tensors. (see `input_ids` above)
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Language modeling loss.
-        **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Multiple choice classification loss.
         **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
...
@@ -827,43 +670,52 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
...
@@ -827,43 +670,52 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2]
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
"""
-    def __init__(self, config):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
-        self.transformer = GPT2Model(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.multiple_choice_head = SequenceSummary(config)
-
-        self.init_weights()
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Make sure we are sharing the input and output embeddings.
-            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
-        """
-        self._tie_or_clone_weights(self.lm_head,
-                                   self.transformer.wte)
-
-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None,
-                token_type_ids=None, position_ids=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name='transformer')
+        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')
+
+    @tf.function
+    def call(self, inputs, training=False):
+        if not isinstance(inputs, (dict, tuple, list)):
+            raise ValueError("Inputs should be a list or a dict with at least two elements: 'inputs_ids' and 'mc_token_ids'")
+        elif isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            mc_token_ids = inputs[1]
+            past = inputs[2] if len(inputs) > 2 else None
+            attention_mask = inputs[3] if len(inputs) > 3 else None
+            token_type_ids = inputs[4] if len(inputs) > 4 else None
+            position_ids = inputs[5] if len(inputs) > 5 else None
+            head_mask = inputs[6] if len(inputs) > 6 else None
+            assert len(inputs) <= 7, "Too many inputs."
+        else:
+            input_ids = inputs.get('input_ids')
+            mc_token_ids = inputs.get('mc_token_ids')
+            past = inputs.get('past', None)
+            attention_mask = inputs.get('attention_mask', None)
+            token_type_ids = inputs.get('token_type_ids', None)
+            position_ids = inputs.get('position_ids', None)
+            head_mask = inputs.get('head_mask', None)
+            assert len(inputs) <= 5, "Too many inputs."
+
+        num_choices = shape_list(input_ids)[1]
+        seq_length = shape_list(input_ids)[2]
+
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+
+        flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids,
+                       flat_position_ids, head_mask]
+
+        outputs = self.transformer(flat_inputs, training=training)

         hidden_states = transformer_outputs[0]

-        lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+        lm_logits = self.transformer.wte(hidden_states, mode="linear")
+        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)

         outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
-                            mc_labels.view(-1))
-            outputs = (loss,) + outputs
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            outputs = (loss,) + outputs

         return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
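For intuition, a small runnable sketch (not part of the commit) of the input shaping the new call() performs before handing tensors to the transformer: each choice is folded into the batch dimension, exactly as the flat_* reshapes above do. The double-heads path itself is still work in progress in this commit (its check in the new test file below is commented out), so only the tensor manipulation is shown:

import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 7
input_ids = tf.random.uniform((batch_size, num_choices, seq_length), maxval=99, dtype=tf.int32)
mc_token_ids = tf.fill((batch_size, num_choices), seq_length - 1)   # classify on the last token

flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (batch_size * num_choices, seq_length)
print(flat_input_ids.shape)                                          # (8, 7)

# The model is intended to be called with a dict or list pairing the ids with mc_token_ids, e.g.
# model({'input_ids': input_ids, 'mc_token_ids': mc_token_ids}); per the docstring above this yields
# lm_logits of shape (batch_size, num_choices, seq_length, vocab_size) and mc_logits of shape (batch_size, num_choices).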
pytorch_transformers/modeling_tf_utils.py
@@ -273,15 +273,117 @@ class TFConv1D(tf.keras.layers.Layer):
                 mean=0., stddev=0.02))
         self.bias = self.add_weight(
             "bias",
-            shape=[self.nx, self.nf],
+            shape=[1, self.nf],
             initializer=tf.zeros_initializer())

     @tf.function
     def call(self, x):
-        size_out = tf.shape(x)[:-1] + (self.nf,)
-        x = tf.reshape(x, [-1, tf.shape(x)[-1]])
+        bz, sl = shape_list(x)[:2]
+
+        x = tf.reshape(x, [-1, self.nx])
         x = tf.matmul(x, self.weight) + self.bias
-        x = tf.reshape(x, size_out)
+        x = tf.reshape(x, [bz, sl, self.nf])
         return x
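For intuition, a minimal sketch (not part of the commit) of the computation the rewritten call() performs: the [batch, seq_len, nx] input is collapsed to 2-D, projected with a single matmul, and reshaped back to [batch, seq_len, nf], which also matches the new [1, nf] bias shape:

import tensorflow as tf

bz, sl, nx, nf = 2, 5, 4, 8
x = tf.random.normal((bz, sl, nx))
weight = tf.random.normal((nx, nf))
bias = tf.zeros((1, nf))

h = tf.reshape(x, [-1, nx])            # collapse batch and sequence dims: [bz * sl, nx]
h = tf.matmul(h, weight) + bias        # token-wise affine projection
h = tf.reshape(h, [bz, sl, nf])        # restore [batch, seq_len, nf]
print(h.shape)                         # (2, 5, 8)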
class TFSequenceSummary(tf.keras.layers.Layer):
    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
        Args of the config class:
            summary_type:
                - 'last' => [default] take the last token hidden state (like XLNet)
                - 'first' => take the first token hidden state (like Bert)
                - 'mean' => take the mean of all tokens hidden states
                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                - 'attn' => Not implemented now, use multi-head attention
            summary_use_proj: Add a projection after the vector extraction
            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
    """
    def __init__(self, config, **kwargs):
        super(TFSequenceSummary, self).__init__(**kwargs)

        self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
            raise NotImplementedError

        self.summary = tf.keras.layers.Identity(name='summary')
        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
            self.summary = tf.keras.layers.Dense(num_classes, name='summary')

        self.activation = None
        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
            self.activation = tf.keras.layers.Tanh()

        self.first_dropout = None
        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)

        self.last_dropout = None
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)

    @tf.function
    def call(self, inputs, training=False):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if not isinstance(inputs, (dict, tuple, list)):
            hidden_states = inputs
            cls_index = None
        elif isinstance(inputs, (tuple, list)):
            hidden_states = inputs[0]
            cls_index = inputs[1] if len(inputs) > 1 else None
            assert len(inputs) <= 2, "Too many inputs."
        else:
            input_ids = inputs.get('input_ids')
            cls_index = inputs.get('cls_index', None)

        if self.summary_type == 'last':
            output = hidden_states[:, -1]
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = tf.mean(hidden_states, axis=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                cls_index = tf.fill(tf.shape(hidden_states[..., :1, :]), hidden_states.shape[-2] - 1, dtype=tf.int32)
            else:
                cls_index = cls_index[..., tf.newaxis, tf.newaxis]
                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError

        if training and self.first_dropout is not None:
            output = self.first_dropout(output)

        output = self.summary(output)

        if self.activation is not None:
            output = self.activation(output)

        if training and self.last_dropout is not None:
            output = self.last_dropout(output)

        return output
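For reference, a short sketch (not part of the commit) of what the three simple summary modes reduce to on a [bsz, seq_len, hidden_size] tensor (written here with tf.reduce_mean); the layer then optionally applies dropout, the Dense projection and an activation on top of the extracted vector:

import tensorflow as tf

hidden_states = tf.random.normal((2, 7, 32))     # [bsz, seq_len, hidden_size]

last = hidden_states[:, -1]                      # summary_type == 'last'  -> [bsz, hidden_size]
first = hidden_states[:, 0]                      # summary_type == 'first' -> [bsz, hidden_size]
mean = tf.reduce_mean(hidden_states, axis=1)     # summary_type == 'mean'  -> [bsz, hidden_size]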
def shape_list(x):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
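A small example (not part of the commit) of why shape_list is preferred over x.shape or tf.shape alone: statically known dimensions come back as plain Python ints, while unknown dimensions fall back to the dynamic shape tensor, so the result can be fed straight into tf.reshape:

import tensorflow as tf
from pytorch_transformers.modeling_tf_utils import shape_list

x = tf.ones((3, 5, 8))
print(shape_list(x))      # [3, 5, 8] -- fully static, plain ints

@tf.function(input_signature=[tf.TensorSpec(shape=[None, 5, 8], dtype=tf.float32)])
def flatten(t):
    bsz, seq_len, hidden = shape_list(t)          # bsz is a scalar tf.Tensor, the rest are ints
    return tf.reshape(t, [bsz * seq_len, hidden])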
pytorch_transformers/tests/modeling_gpt2_test.py
@@ -44,6 +44,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
+                    use_input_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
@@ -66,6 +67,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
             self.seq_length = seq_length
             self.is_training = is_training
             self.use_token_type_ids = use_token_type_ids
+            self.use_input_mask = use_input_mask
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.hidden_size = hidden_size
@@ -86,6 +88,10 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
             token_type_ids = None
             if self.use_token_type_ids:
                 token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
@@ -115,14 +121,14 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
             head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

-            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
+            return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

         def check_loss_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss"].size()),
                 [])

-        def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2Model(config=config)
             model.eval()
@@ -139,7 +145,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertEqual(len(result["presents"]), config.n_layer)

-        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2LMHeadModel(config)
             model.eval()
@@ -157,7 +163,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.seq_length, self.vocab_size])

-        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2DoubleHeadsModel(config)
             model.eval()
@@ -177,7 +183,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids,
                            'token_type_ids': token_type_ids,
pytorch_transformers/tests/modeling_tf_gpt2_test.py
0 → 100644
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester

from pytorch_transformers import GPT2Config, is_tf_available

try:
    import tensorflow as tf
    from pytorch_transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                                       TFGPT2DoubleHeadsModel,
                                                       TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
except ImportError:
    pytestmark = pytest.mark.skip("Require TensorFlow")


class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else ()

    class TFGPT2ModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
                     use_input_mask=True,
                     use_labels=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = GPT2Config(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return config, input_ids, input_mask, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels

        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2Model(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, None, input_mask]  # None is the input for 'past'
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFGPT2LMHeadModel(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores = model(inputs)[0]
            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            pass
            # model = TFGPT2DoubleHeadsModel(config=config)
            # inputs = {'input_ids': input_ids,
            #           'attention_mask': input_mask,
            #           'token_type_ids': token_type_ids}
            # seq_relationship_score, = model(inputs)[0]
            # result = {
            #     "seq_relationship_score": seq_relationship_score.numpy(),
            # }
            # self.parent.assertListEqual(
            #     list(result["seq_relationship_score"].shape),
            #     [self.batch_size, 2])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, head_mask, token_type_ids,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids,
                           'token_type_ids': token_type_ids,
                           'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFGPT2ModelTest.TFGPT2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GPT2Config, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_gpt2_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)

    def test_gpt2_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)

    def test_gpt2_double_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()