chenpangpang / transformers

Commit e25cba78, authored Sep 04, 2019 by thomwolf
parent 38b79b5a

    WIP reodering arguments for torchscript and TF

Showing 10 changed files with 331 additions and 253 deletions (+331 / -253)
Changed files:

    pytorch_transformers/modeling_bert.py                     +74   -77
    pytorch_transformers/modeling_distilbert.py                +9    -9
    pytorch_transformers/modeling_gpt2.py                     +65   -44
    pytorch_transformers/modeling_openai.py                   +49   -34
    pytorch_transformers/modeling_roberta.py                  +33   -14
    pytorch_transformers/modeling_transfo_xl.py                +1    -1
    pytorch_transformers/modeling_xlm.py                      +44   -29
    pytorch_transformers/modeling_xlnet.py                    +41   -32
    pytorch_transformers/tests/modeling_bert_test.py          +13   -11
    pytorch_transformers/tests/modeling_distilbert_test.py     +2    -2
pytorch_transformers/modeling_bert.py

@@ -596,18 +596,18 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
             (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
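Editor's note on the hunk above: the reordering only changes the documented (and positional) order of the inputs, not how they are built. A minimal sketch, not part of this commit, of constructing the input_ids and attention_mask described above; the bert-base-uncased checkpoint name is only illustrative:

    import torch
    from pytorch_transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Token indices, as described under **input_ids**
    input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])
    # 1 for real tokens, 0 for padding, as described under **attention_mask**
    attention_mask = torch.ones_like(input_ids)

    outputs = model(input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]   # (batch_size, sequence_length, hidden_size)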
@@ -668,7 +668,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
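The commit title ties this reordering to torchscript and TF. The practical effect (an inference from the commit message, not stated in the diff itself) is that torch.jit.trace, which feeds its example inputs positionally, can now be given (input_ids, attention_mask) without silently routing the mask into token_type_ids. A hedged sketch, again with an illustrative checkpoint name:

    import torch
    from pytorch_transformers import BertModel

    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    input_ids = torch.zeros(1, 8, dtype=torch.long)
    attention_mask = torch.ones(1, 8, dtype=torch.long)

    # torch.jit.trace passes example inputs positionally, so with the reordered
    # signature the second positional slot is attention_mask (not token_type_ids).
    traced = torch.jit.trace(model, (input_ids, attention_mask))
    traced_outputs = traced(input_ids, attention_mask)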
@@ -771,10 +771,14 @@ class BertForPreTraining(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                next_sentence_label=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None, next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

@@ -839,10 +843,14 @@ class BertForMaskedLM(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)

@@ -896,10 +904,15 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         seq_relationship_score = self.cls(pooled_output)

@@ -957,10 +970,15 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)

@@ -983,45 +1001,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING)
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
     r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-            (a) For sequence pairs:
-                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-            (b) For single sequences:
-                ``tokens: [CLS] the dog is hairy . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension

@@ -1061,16 +1043,21 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
         num_choices = input_ids.shape[1]
 
-        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
-                            attention_mask=flat_attention_mask, head_mask=head_mask)
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)
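The multiple-choice head above flattens (batch_size, num_choices, sequence_length) inputs into (batch_size * num_choices, sequence_length) rows before calling self.bert; the diff only drops the flat_* temporaries and reuses the argument names. A small standalone sketch of that reshape (not part of the commit; the per-choice re-grouping happens further down the file and is shown here only as a comment):

    import torch

    batch_size, num_choices, seq_len = 2, 4, 16
    input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
    attention_mask = torch.ones(batch_size, num_choices, seq_len, dtype=torch.long)

    num_choices = input_ids.shape[1]
    # Same reshape as the new forward(): every choice becomes its own row
    input_ids = input_ids.view(-1, input_ids.size(-1))                  # (8, 16)
    attention_mask = attention_mask.view(-1, attention_mask.size(-1))   # (8, 16)

    # Later in the model (not shown in this hunk), the per-row classifier logits
    # are reshaped back to one score per choice, roughly:
    #     reshaped_logits = logits.view(-1, num_choices)    # (batch_size, num_choices)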
@@ -1129,10 +1116,15 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         sequence_output = self.dropout(sequence_output)

@@ -1203,10 +1195,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
-                end_positions=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
 
         sequence_output = outputs[0]
         logits = self.qa_outputs(sequence_output)


pytorch_transformers/modeling_distilbert.py

@@ -585,10 +585,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         self._tie_or_clone_weights(self.vocab_projector,
                                    self.distilbert.embeddings.word_embeddings)
 
-    def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
         dlbrt_output = self.distilbert(input_ids=input_ids,
                                        attention_mask=attention_mask,
                                        head_mask=head_mask)
         hidden_states = dlbrt_output[0]                           # (bs, seq_length, dim)
         prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
         prediction_logits = gelu(prediction_logits)              # (bs, seq_length, dim)

@@ -649,10 +649,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
         hidden_state = distilbert_output[0]                  # (bs, seq_len, dim)
         pooled_output = hidden_state[:, 0]                   # (bs, dim)
         pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)

@@ -723,10 +723,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
         hidden_states = distilbert_output[0]                 # (bs, max_query_len, dim)
         hidden_states = self.dropout(hidden_states)          # (bs, max_query_len, dim)
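The DistilBERT changes above only swap head_mask with the label-style arguments. Callers using keyword arguments are unaffected; purely positional callers must account for the new slot order. An illustrative sketch, not from the diff (distilbert-base-uncased is just an example checkpoint, and the classification head here is randomly initialized):

    import torch
    from pytorch_transformers import DistilBertForSequenceClassification

    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    input_ids = torch.zeros(1, 8, dtype=torch.long)
    attention_mask = torch.ones(1, 8, dtype=torch.long)
    labels = torch.tensor([0])

    # Keyword arguments are unaffected by the reordering:
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss, logits = outputs[:2]

    # Positional callers must now place head_mask before the labels:
    #   old order: model(input_ids, attention_mask, labels)
    #   new order: model(input_ids, attention_mask, head_mask, labels)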
pytorch_transformers/modeling_gpt2.py

@@ -257,7 +257,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))

@@ -265,6 +265,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, ns-nd:ns, :ns]
         w = w * b - 1e4 * (1 - b)
 
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)
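For context on the `w = w + attention_mask` line added above: by the time it reaches _attn, the mask has already been converted (in GPT2Model.forward, later in this diff) into an additive bias that is 0 for visible positions and -10000 for padded ones, so adding it before the softmax effectively removes attention to padding. A tiny self-contained sketch of that effect, not part of the commit:

    import torch

    # Toy attention scores for one query over 4 keys
    w = torch.tensor([[1.0, 2.0, 3.0, 4.0]])

    # Padding mask: 1 = attend, 0 = padding (last position is padding)
    mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])
    additive_mask = (1.0 - mask) * -10000.0   # 0 where allowed, -10000 where masked

    w = w + additive_mask
    probs = torch.softmax(w, dim=-1)
    # probs[..., -1] is ~0: the padded key effectively receives no attention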
@@ -290,7 +294,7 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
 
-    def forward(self, x, layer_past=None, head_mask=None):
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)

@@ -302,7 +306,7 @@ class Attention(nn.Module):
             value = torch.cat((past_value, value), dim=-2)
         present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
 
-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]
 
         a = self.merge_heads(a)

@@ -337,8 +341,11 @@ class Block(nn.Module):
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
 
-    def forward(self, x, layer_past=None, head_mask=None):
-        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
+        output_attn = self.attn(self.ln_1(x),
+                                layer_past=layer_past,
+                                attention_mask=attention_mask,
+                                head_mask=head_mask)
         a = output_attn[0]  # output_attn: a, present, (attentions)
 
         x = x + a

@@ -404,17 +411,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **past**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:

@@ -473,7 +484,7 @@ class GPT2Model(GPT2PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if past is None:
             past_length = 0
             past = [None] * len(self.h)

@@ -483,6 +494,23 @@ class GPT2Model(GPT2PreTrainedModel):
             position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
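A standalone sketch of what the mask preparation added above computes, using plain tensors (not part of the commit; float32 is assumed where the real code uses the model's parameter dtype):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]], dtype=torch.float)   # (batch, to_seq_length)

    # Same steps as the new code above:
    extended = attention_mask.unsqueeze(1).unsqueeze(2)   # (batch, 1, 1, to_seq_length)
    extended = extended.to(dtype=torch.float)             # fp16 models would use their own dtype here
    extended = (1.0 - extended) * -10000.0

    print(extended.shape)   # torch.Size([2, 1, 1, 4])
    # Broadcasting against scores of shape (batch, num_heads, from_seq, to_seq)
    # adds 0 to allowed positions and -10000 to padded ones.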
@@ -520,7 +548,11 @@ class GPT2Model(GPT2PreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
-            outputs = block(hidden_states, layer_past, head_mask[i])
+            outputs = block(hidden_states,
+                            past=layer_past,
+                            attention_mask=attention_mask,
+                            head_mask=head_mask[i])
             hidden_states, present = outputs[:2]
             presents = presents + (present,)

@@ -601,9 +633,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.wte)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
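Since the reordered GPT2LMHeadModel.forward now takes `past` directly after input_ids, here is a hedged usage sketch of the sequential-decoding pattern the **past** docstring refers to (not part of the diff; the gpt2 checkpoint name and greedy argmax step are only illustrative):

    import torch
    from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    context = torch.tensor([tokenizer.encode("The commit reorders")])

    with torch.no_grad():
        # First pass: full context; `past` caches the keys/values of every layer
        logits, past = model(context)[:2]
        next_token = logits[0, -1].argmax().view(1, 1)

        # Later passes: only the new token plus the cached `past`, which now sits
        # in the second positional slot of forward()
        logits, past = model(next_token, past=past)[:2]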
@@ -626,33 +663,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING)
+""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``

@@ -725,10 +741,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.wte)
 
-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                mc_token_ids=None, lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
pytorch_transformers/modeling_openai.py

@@ -270,7 +270,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))

@@ -279,6 +279,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
         w = w * b + -1e9 * (1 - b)
 
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)

@@ -304,14 +308,14 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)
 
-    def forward(self, x, head_mask=None):
+    def forward(self, x, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)
 
-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]
 
         a = self.merge_heads(a)

@@ -346,8 +350,8 @@ class Block(nn.Module):
         self.mlp = MLP(4 * nx, config)
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
 
-    def forward(self, x, head_mask=None):
-        attn_outputs = self.attn(x, head_mask=head_mask)
+    def forward(self, x, attention_mask=None, head_mask=None):
+        attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
         a = attn_outputs[0]
 
         n = self.ln_1(x + a)
@@ -410,13 +414,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens (can be used to indicate various portions of the inputs).
             The embeddings from these tokens will be summed with the respective token embeddings.
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:

@@ -470,7 +478,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
             # start = self.config.vocab_size + self.config.n_special

@@ -479,6 +487,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N

@@ -515,7 +540,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
 
-            outputs = block(hidden_states, head_mask[i])
+            outputs = block(hidden_states, attention_mask, head_mask[i])
             hidden_states = outputs[0]
             if self.output_attentions:
                 all_attentions = all_attentions + (outputs[1],)
@@ -580,8 +605,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+                                               position_ids=position_ids, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)

@@ -604,29 +633,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", OPENAI_GPT_START_DOCSTRING)
+""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``

@@ -687,9 +699,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)
 
-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+                                               position_ids=position_ids, head_mask=head_mask)
         hidden_states = transformer_outputs[0]
pytorch_transformers/modeling_roberta.py

@@ -61,7 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
             # cf. fairseq's `utils.make_positions`
             position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
         return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
 
 
 class RobertaConfig(BertConfig):

@@ -116,13 +118,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -170,12 +179,16 @@ class RobertaModel(BertModel):
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if input_ids[:, 0].sum().item() != 0:
             logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
                            "Please specify add_special_tokens=True in your encoding.")
-        return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
+        return super(RobertaModel, self).forward(input_ids,
+                                                 attention_mask=attention_mask,
+                                                 token_type_ids=token_type_ids,
+                                                 position_ids=position_ids,
+                                                 head_mask=head_mask)
 
 
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
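The warning emitted above expects every sequence to start with RoBERTa's <s> token (id 0), which is what `input_ids[:, 0].sum().item() != 0` checks. A minimal sketch of an encoding that satisfies it (not part of the commit; roberta-base is an example checkpoint):

    import torch
    from pytorch_transformers import RobertaModel, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # add_special_tokens=True adds the <s> / </s> markers that the warning in
    # RobertaModel.forward above checks for (first token id must be 0).
    input_ids = torch.tensor([tokenizer.encode("Hello world", add_special_tokens=True)])
    attention_mask = torch.ones_like(input_ids)

    outputs = model(input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]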
@@ -229,10 +242,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
-                head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)

@@ -313,10 +329,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.roberta = RobertaModel(config)
         self.classifier = RobertaClassificationHead(config)
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)
pytorch_transformers/modeling_transfo_xl.py
View file @ e25cba78
...
@@ -1342,7 +1342,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     def init_mems(self, data):
         return self.transformer.init_mems(data)

-    def forward(self, input_ids, labels=None, mems=None, head_mask=None):
+    def forward(self, input_ids, mems=None, head_mask=None, labels=None):
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)
...
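The Transformer-XL hunk is the smallest, but it shows the rule the whole commit follows: loss-related arguments (`labels`) move to the end, after the memory/mask arguments. Calls that pass `labels` by keyword are unaffected; purely positional calls change meaning. An illustrative comparison (the `model`, `input_ids` and `lm_labels` names are placeholders, not taken from the diff):

    # before: forward(self, input_ids, labels=None, mems=None, head_mask=None)
    # after:  forward(self, input_ids, mems=None, head_mask=None, labels=None)
    outputs = model(input_ids, labels=lm_labels)   # keyword call: same meaning before and after
    # outputs = model(input_ids, lm_labels)        # positional call: would now bind lm_labels to `mems`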
pytorch_transformers/modeling_xlm.py
View file @ e25cba78
...
@@ -441,23 +441,23 @@ XLM_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens to be used to indicate the language of each token in the input.
             Indices are languages ids which can be obtained from the language names by using two conversion mappings
             provided in the configuration of the model (only provided for multilingual models).
             More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
             the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Length of each sentence that can be used to avoid performing attention on padding token indices.
             You can also use `attention_mask` for the same result (see above), kept here for compatbility.
...
@@ -584,8 +584,8 @@ class XLMModel(XLMPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
         if lengths is None:
             lengths = (input_ids != self.pad_index).sum(dim=1).long()
             # mask = input_ids != self.pad_index
...
@@ -790,11 +790,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         outputs = self.pred_layer(output, labels)
...
@@ -846,11 +851,16 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
...
@@ -924,12 +934,17 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, start_positions=None, end_positions=None,
-                cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
+                is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
...
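After this change every XLM head shares the same leading order, `input_ids, attention_mask, langs, token_type_ids, position_ids, lengths, cache, head_mask`, with the task-specific arguments (`labels`, `start_positions`, ...) trailing. A hedged sketch of a call site under the new order (`xlm_model` and the input tensors are placeholder names, not from the diff):

    # the padding mask can now be passed as the second positional argument ...
    output = xlm_model(input_ids, attention_mask)[0]

    # ... while keyword calls stay order-independent and work before and after the change
    output = xlm_model(input_ids,
                       attention_mask=attention_mask,
                       langs=None,      # language ids only matter for multilingual checkpoints
                       lengths=None)[0]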
pytorch_transformers/modeling_xlnet.py
View file @ e25cba78
...
@@ -647,21 +647,10 @@ XLNET_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
-            Kept for compatibility with the original code base.
-            You can only uses one of `input_mask` and `attention_mask`
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
...
@@ -679,6 +668,17 @@ XLNET_INPUTS_DOCSTRING = r"""
             Mask to indicate the output tokens to use.
             If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
             Only used during pretraining for partial prediction or for sequential decoding (generation).
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+            Kept for compatibility with the original code base.
+            You can only uses one of `input_mask` and `attention_mask`
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
...
@@ -837,8 +837,8 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
...
@@ -1042,12 +1042,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         logits = self.lm_loss(transformer_outputs[0])
...
@@ -1113,12 +1116,15 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
...
@@ -1215,13 +1221,16 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
-                head_mask=None):
-        transformer_outputs = self.transformer(input_ids,
-                                               token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
-                                               head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)
...
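The XLNet docstring hunks mirror the signature hunks: `token_type_ids` and `input_mask` are now documented after the permutation/memory arguments, which is also where they sit in `forward`. Per the docstring, `input_mask` is just the negation of `attention_mask`, so a caller sets only one of the two. A small sketch with placeholder names (`xlnet_model`, `past_mems`), not taken from the commit:

    # new order: input_ids, attention_mask, mems, perm_mask, target_mapping, token_type_ids, input_mask, head_mask
    outputs = xlnet_model(input_ids, attention_mask=attention_mask, mems=past_mems)
    hidden_states = outputs[0]   # outputs[1] holds the updated memories (see the model's Outputs docstring)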
pytorch_transformers/tests/modeling_bert_test.py
View file @ e25cba78
...
@@ -126,8 +126,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertModel(config=config)
         model.eval()
-        sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-        sequence_output, pooled_output = model(input_ids, token_type_ids)
+        sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
         sequence_output, pooled_output = model(input_ids)
         result = {
...
@@ -143,7 +143,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForMaskedLM(config=config)
         model.eval()
-        loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+        loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
         result = {
             "loss": loss,
             "prediction_scores": prediction_scores,
...
@@ -156,7 +156,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForNextSentencePrediction(config=config)
         model.eval()
-        loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
+        loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
         result = {
             "loss": loss,
             "seq_relationship_score": seq_relationship_score,
...
@@ -170,7 +170,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForPreTraining(config=config)
         model.eval()
-        loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
+        loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                                 masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
         result = {
             "loss": loss,
             "prediction_scores": prediction_scores,
...
@@ -188,7 +189,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = BertForQuestionAnswering(config=config)
         model.eval()
-        loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
+        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                               start_positions=sequence_labels, end_positions=sequence_labels)
         result = {
             "loss": loss,
             "start_logits": start_logits,
...
@@ -207,7 +209,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = BertForSequenceClassification(config)
         model.eval()
-        loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
@@ -222,7 +224,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = BertForTokenClassification(config=config)
         model.eval()
-        loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
@@ -241,9 +243,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
         multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
         loss, logits = model(multiple_choice_inputs_ids,
-                             multiple_choice_token_type_ids,
-                             multiple_choice_input_mask,
-                             choice_labels)
+                             attention_mask=multiple_choice_input_mask,
+                             token_type_ids=multiple_choice_token_type_ids,
+                             labels=choice_labels)
         result = {
             "loss": loss,
             "logits": logits,
...
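The test changes pin the convention down by always passing `attention_mask` and `token_type_ids` as keywords. A sanity check that is implied by (but not added in) this commit is that, under the new order, positional and keyword calls resolve to the same computation; a hedged sketch reusing the fixture names from the test above:

    import torch

    model = BertModel(config=config)
    model.eval()
    with torch.no_grad():
        out_kw = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
        out_pos = model(input_ids, input_mask, token_type_ids)[0]   # same binding under the new order
    assert torch.allclose(out_kw, out_pos)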
pytorch_transformers/tests/modeling_distilbert_test.py
View file @ e25cba78
...
@@ -148,7 +148,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
     def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
         model = DistilBertForQuestionAnswering(config=config)
         model.eval()
-        loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
+        loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
         result = {
             "loss": loss,
             "start_logits": start_logits,
...
@@ -166,7 +166,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         config.num_labels = self.num_labels
         model = DistilBertForSequenceClassification(config)
         model.eval()
-        loss, logits = model(input_ids, input_mask, sequence_labels)
+        loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
         result = {
             "loss": loss,
             "logits": logits,
...