chenpangpang / transformers

Commit e25cba78, authored Sep 04, 2019 by thomwolf
WIP reordering arguments for torchscript and TF
parent 38b79b5a
Showing 10 changed files with 331 additions and 253 deletions (+331, -253)
pytorch_transformers/modeling_bert.py                     +74 -77
pytorch_transformers/modeling_distilbert.py                +9  -9
pytorch_transformers/modeling_gpt2.py                     +65 -44
pytorch_transformers/modeling_openai.py                   +49 -34
pytorch_transformers/modeling_roberta.py                  +33 -14
pytorch_transformers/modeling_transfo_xl.py                +1  -1
pytorch_transformers/modeling_xlm.py                      +44 -29
pytorch_transformers/modeling_xlnet.py                    +41 -32
pytorch_transformers/tests/modeling_bert_test.py          +13 -11
pytorch_transformers/tests/modeling_distilbert_test.py     +2  -2
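
The reordering puts the tensor-valued inputs (input_ids, attention_mask, token_type_ids, position_ids, head_mask) ahead of the label arguments, so the models can be called positionally, which is what torch.jit.trace and the planned TF ports rely on. A minimal sketch of the intended usage (assuming a pytorch_transformers install and a downloadable bert-base-uncased checkpoint; this snippet is illustrative and not part of the commit):

    import torch
    from pytorch_transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    input_ids = torch.tensor([tokenizer.encode("Hello, world!")])
    attention_mask = torch.ones_like(input_ids)

    # With the new signature forward(input_ids, attention_mask=None, ...), the mask can be
    # passed as the second positional argument, which is what tracing expects.
    traced = torch.jit.trace(model, (input_ids, attention_mask))
    sequence_output, pooled_output = traced(input_ids, attention_mask)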

pytorch_transformers/modeling_bert.py
@@ -596,18 +596,18 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
             (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -668,7 +668,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -771,10 +771,14 @@ class BertForPreTraining(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                next_sentence_label=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None, next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         sequence_output, pooled_output = outputs[:2]
         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
@@ -839,10 +843,14 @@ class BertForMaskedLM(BertPreTrainedModel):
         self._tie_or_clone_weights(self.cls.predictions.decoder,
                                    self.bert.embeddings.word_embeddings)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -896,10 +904,15 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         pooled_output = outputs[1]
         seq_relationship_score = self.cls(pooled_output)
@@ -957,10 +970,15 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)
@@ -983,45 +1001,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING)
+    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
     r"""
-    Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-            (a) For sequence pairs:
-                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-            (b) For single sequences:
-                ``tokens: [CLS] the dog is hairy . [SEP]``
-                ``token_type_ids: 0 0 0 0 0 0 0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
     **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
         Labels for computing the multiple choice classification loss.
         Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
@@ -1061,16 +1043,21 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
         num_choices = input_ids.shape[1]

-        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
-        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
-        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
-        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
-        outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
-                            attention_mask=flat_attention_mask, head_mask=head_mask)
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         pooled_output = outputs[1]
         pooled_output = self.dropout(pooled_output)
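
For the multiple-choice head, each optional input arrives as (batch_size, num_choices, sequence_length) and is flattened to (batch_size * num_choices, sequence_length) before the shared encoder is applied; the classifier logits are then reshaped back to one score per choice. A standalone sketch of that reshaping with dummy tensors (illustrative only, independent of the library):

    import torch

    batch_size, num_choices, seq_len, hidden = 2, 4, 7, 16
    input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

    # Flatten the choices into the batch dimension so one encoder pass scores all choices.
    flat_input_ids = input_ids.view(-1, input_ids.size(-1))        # shape (8, 7)

    # Stand-in for the pooled encoder output: one vector per (example, choice) pair.
    pooled_output = torch.randn(batch_size * num_choices, hidden)
    classifier = torch.nn.Linear(hidden, 1)

    logits = classifier(pooled_output)                             # shape (8, 1)
    reshaped_logits = logits.view(-1, num_choices)                 # shape (2, 4), one score per choice
    print(reshaped_logits.shape)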
@@ -1129,10 +1116,15 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         sequence_output = outputs[0]
         sequence_output = self.dropout(sequence_output)
@@ -1203,10 +1195,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
-                end_positions=None, position_ids=None, head_mask=None):
-        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                            attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)

         sequence_output = outputs[0]

         logits = self.qa_outputs(sequence_output)

pytorch_transformers/modeling_distilbert.py
@@ -585,10 +585,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         self._tie_or_clone_weights(self.vocab_projector,
                                    self.distilbert.embeddings.word_embeddings)

-    def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
         dlbrt_output = self.distilbert(input_ids=input_ids,
-                                       attention_mask=attention_mask,
-                                       head_mask=head_mask)
+                                       attention_mask=attention_mask, head_mask=head_mask)
         hidden_states = dlbrt_output[0]                           # (bs, seq_length, dim)
         prediction_logits = self.vocab_transform(hidden_states)   # (bs, seq_length, dim)
         prediction_logits = gelu(prediction_logits)               # (bs, seq_length, dim)
@@ -649,10 +649,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
-                                            attention_mask=attention_mask,
-                                            head_mask=head_mask)
+                                            attention_mask=attention_mask, head_mask=head_mask)
         hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
         pooled_output = hidden_state[:, 0]                     # (bs, dim)
         pooled_output = self.pre_classifier(pooled_output)     # (bs, dim)
@@ -723,10 +723,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
-                                            attention_mask=attention_mask,
-                                            head_mask=head_mask)
+                                            attention_mask=attention_mask, head_mask=head_mask)
         hidden_states = distilbert_output[0]                   # (bs, max_query_len, dim)
         hidden_states = self.dropout(hidden_states)            # (bs, max_query_len, dim)

pytorch_transformers/modeling_gpt2.py
@@ -257,7 +257,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)

-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))
@@ -265,6 +265,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, ns-nd:ns, :ns]
         w = w * b - 1e4 * (1 - b)

+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)
@@ -290,7 +294,7 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

-    def forward(self, x, layer_past=None, head_mask=None):
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)
@@ -302,7 +306,7 @@ class Attention(nn.Module):
             value = torch.cat((past_value, value), dim=-2)
         present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking

-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]

         a = self.merge_heads(a)
@@ -337,8 +341,11 @@ class Block(nn.Module):
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)

-    def forward(self, x, layer_past=None, head_mask=None):
-        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
+        output_attn = self.attn(self.ln_1(x),
+                                layer_past=layer_past,
+                                attention_mask=attention_mask,
+                                head_mask=head_mask)
         a = output_attn[0]  # output_attn: a, present, (attentions)

         x = x + a
@@ -404,17 +411,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **past**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
             (see `past` output below). Can be used to speed up sequential decoding.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -473,7 +484,7 @@ class GPT2Model(GPT2PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if past is None:
             past_length = 0
             past = [None] * len(self.h)
@@ -483,6 +494,23 @@ class GPT2Model(GPT2PreTrainedModel):
             position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
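
The mask preparation turns a (batch_size, sequence_length) mask of 1s and 0s into a broadcastable additive bias: 0.0 where attention is allowed and -10000.0 where it is masked, added to the raw attention scores before the softmax. A tiny numeric sketch of the same transformation (illustrative only, not part of the commit):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.float)   # 1 = keep, 0 = padding

    # (batch, seq) -> (batch, 1, 1, seq) so it broadcasts over heads and query positions.
    extended_mask = attention_mask.unsqueeze(1).unsqueeze(2)
    extended_mask = (1.0 - extended_mask) * -10000.0

    scores = torch.zeros(1, 1, 5, 5)                 # stand-in attention scores
    probs = torch.softmax(scores + extended_mask, dim=-1)
    print(probs[0, 0, 0])                            # ~[0.333, 0.333, 0.333, 0.000, 0.000]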
@@ -520,7 +548,11 @@ class GPT2Model(GPT2PreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

-            outputs = block(hidden_states, layer_past, head_mask[i])
+            outputs = block(hidden_states,
+                            past=layer_past,
+                            attention_mask=attention_mask,
+                            head_mask=head_mask[i])

             hidden_states, present = outputs[:2]
             presents = presents + (present,)
@@ -601,9 +633,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head, self.transformer.wte)

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None,
+                head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]

         lm_logits = self.lm_head(hidden_states)
@@ -626,33 +663,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", GPT2_START_DOCSTRING)
+""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer):
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
-            (see `past` output below). Can be used to speed up sequential decoding.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
@@ -725,10 +741,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         self._tie_or_clone_weights(self.lm_head, self.transformer.wte)

-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, past=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                                               past=past, head_mask=head_mask)
+    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None,
+                head_mask=None, mc_token_ids=None, lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               past=past,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
         hidden_states = transformer_outputs[0]

         lm_logits = self.lm_head(hidden_states)

pytorch_transformers/modeling_openai.py
@@ -270,7 +270,7 @@ class Attention(nn.Module):
         self.n_head = self.n_head - len(heads)
         self.pruned_heads = self.pruned_heads.union(heads)

-    def _attn(self, q, k, v, head_mask=None):
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))
@@ -279,6 +279,10 @@ class Attention(nn.Module):
         b = self.bias[:, :, :w.size(-2), :w.size(-1)]
         w = w * b + -1e9 * (1 - b)

+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)
@@ -304,14 +308,14 @@ class Attention(nn.Module):
         else:
             return x.permute(0, 2, 1, 3)

-    def forward(self, x, head_mask=None):
+    def forward(self, x, attention_mask=None, head_mask=None):
         x = self.c_attn(x)
         query, key, value = x.split(self.split_size, dim=2)
         query = self.split_heads(query)
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)

-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
         a = attn_outputs[0]

         a = self.merge_heads(a)
@@ -346,8 +350,8 @@ class Block(nn.Module):
         self.mlp = MLP(4 * nx, config)
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)

-    def forward(self, x, head_mask=None):
-        attn_outputs = self.attn(x, head_mask=head_mask)
+    def forward(self, x, attention_mask=None, head_mask=None):
+        attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
         a = attn_outputs[0]

         n = self.ln_1(x + a)
@@ -410,13 +414,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens (can be used to indicate various portions of the inputs).
             The embeddings from these tokens will be summed with the respective token embeddings.
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -470,7 +478,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if position_ids is None:
             # This was used when we had a single embedding matrice from position and token embeddings
             # start = self.config.vocab_size + self.config.n_special
@@ -479,6 +487,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
@@ -515,7 +540,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

-            outputs = block(hidden_states, head_mask[i])
+            outputs = block(hidden_states, attention_mask, head_mask[i])
             hidden_states = outputs[0]
             if self.output_attentions:
                 all_attentions = all_attentions + (outputs[1],)
@@ -580,8 +605,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None,
+                head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
                                                head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
@@ -604,29 +633,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
     The language modeling head has its weights tied to the input embeddings,
     the classification head takes as input the input of a specified classification token index in the input sequence).
-""", OPENAI_GPT_START_DOCSTRING)
+""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    r""" Inputs:
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of input sequence tokens in the vocabulary.
-            The second dimension of the input (`num_choices`) indicates the number of choices to score.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+    r"""
         **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
             Index of the classification token in each input sequence.
             Selected in the range ``[0, input_ids.size(-1) - 1[``.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
-            Mask to nullify selected heads of the self-attention modules.
-            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
@@ -687,9 +699,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self._tie_or_clone_weights(self.lm_head,
                                    self.transformer.tokens_embed)

-    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
                                                head_mask=head_mask)
         hidden_states = transformer_outputs[0]

pytorch_transformers/modeling_roberta.py
@@ -61,7 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
             # cf. fairseq's `utils.make_positions`
             position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-        return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
+        return super(RobertaEmbeddings, self).forward(input_ids,
+                                                      token_type_ids=token_type_ids,
+                                                      position_ids=position_ids)


 class RobertaConfig(BertConfig):
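
RoBERTa follows fairseq's convention where position ids start at padding_idx + 1 rather than 0, so the padding embedding index is never produced for real tokens. A quick sketch of the arange above with padding_idx = 1 and a sequence length of 4 (illustrative only):

    import torch

    padding_idx, seq_length = 1, 4
    position_ids = torch.arange(padding_idx + 1, seq_length + padding_idx + 1, dtype=torch.long)
    print(position_ids)   # tensor([2, 3, 4, 5]) -- indices 0 and 1 stay reserved for padding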
@@ -116,13 +118,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
@@ -170,12 +179,16 @@ class RobertaModel(BertModel):
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if input_ids[:, 0].sum().item() != 0:
             logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                            "This model requires special tokens in order to work. "
                            "Please specify add_special_tokens=True in your encoding.")
-        return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
+        return super(RobertaModel, self).forward(input_ids,
+                                                 attention_mask=attention_mask,
+                                                 token_type_ids=token_type_ids,
+                                                 position_ids=position_ids,
+                                                 head_mask=head_mask)


 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
@@ -229,10 +242,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
@@ -313,10 +329,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.roberta = RobertaModel(config)
         self.classifier = RobertaClassificationHead(config)

-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
-                position_ids=None, head_mask=None):
-        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
-                               attention_mask=attention_mask, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)

pytorch_transformers/modeling_transfo_xl.py
@@ -1342,7 +1342,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     def init_mems(self, data):
         return self.transformer.init_mems(data)

-    def forward(self, input_ids, labels=None, mems=None, head_mask=None):
+    def forward(self, input_ids, mems=None, head_mask=None, labels=None):
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)

pytorch_transformers/modeling_xlm.py
@@ -441,23 +441,23 @@ XLM_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
         **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens to be used to indicate the language of each token in the input.
             Indices are languages ids which can be obtained from the language names by using two conversion mappings
             provided in the configuration of the model (only provided for multilingual models).
             More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
             the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
             Length of each sentence that can be used to avoid performing attention on padding token indices.
             You can also use `attention_mask` for the same result (see above), kept here for compatbility.
@@ -584,8 +584,8 @@ class XLMModel(XLMPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.attentions[layer].prune_heads(heads)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None,
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
         if lengths is None:
             lengths = (input_ids != self.pad_index).sum(dim=1).long()
         # mask = input_ids != self.pad_index
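
XLM accepts either `lengths` or `attention_mask`; when `lengths` is omitted it is recovered from the padding index, and a length is equivalent to a mask that is 1 up to that length. A small sketch of that equivalence (illustrative only, with a hypothetical pad index of 2):

    import torch

    pad_index = 2
    input_ids = torch.tensor([[5, 6, 7, 2, 2],
                              [8, 9, 2, 2, 2]])

    lengths = (input_ids != pad_index).sum(dim=1).long()      # tensor([3, 2])

    # The equivalent attention mask: 1 for real tokens, 0 for padding.
    seq_len = input_ids.size(1)
    attention_mask = (torch.arange(seq_len)[None, :] < lengths[:, None]).long()
    print(attention_mask)
    # tensor([[1, 1, 1, 0, 0],
    #         [1, 1, 0, 0, 0]])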
@@ -790,11 +790,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         """
         self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         outputs = self.pred_layer(output, labels)
@@ -846,11 +851,16 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]
         logits = self.sequence_summary(output)
@@ -924,12 +934,17 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
-                attention_mask=None, cache=None, start_positions=None, end_positions=None,
-                cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
-                                               token_type_ids=token_type_ids, langs=langs,
-                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
+                is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
         output = transformer_outputs[0]

pytorch_transformers/modeling_xlnet.py
@@ -647,21 +647,10 @@ XLNET_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-            Mask to avoid performing attention on padding token indices.
-            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
-            Kept for compatibility with the original code base.
-            You can only uses one of `input_mask` and `attention_mask`
-            Mask values selected in ``[0, 1]``:
-            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
@@ -679,6 +668,17 @@ XLNET_INPUTS_DOCSTRING = r"""
             Mask to indicate the output tokens to use.
             If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token.
             Only used during pretraining for partial prediction or for sequential decoding (generation).
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+            Kept for compatibility with the original code base.
+            You can only uses one of `input_mask` and `attention_mask`
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
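
As the docstring notes, `input_mask` is simply the negation of `attention_mask` (1 marks padding instead of real tokens), kept for parity with the original XLNet code base; only one of the two should be given. The relationship as a one-liner (illustrative only):

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.float)
    input_mask = 1.0 - attention_mask      # tensor([[0., 0., 0., 1., 1.]])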
@@ -837,8 +837,8 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
@@ -1042,12 +1042,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         """
         self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
                                                head_mask=head_mask)
         logits = self.lm_loss(transformer_outputs[0])
@@ -1113,12 +1116,15 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None, labels=None, head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
                                                head_mask=head_mask)
         output = transformer_outputs[0]
@@ -1215,13 +1221,16 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
-                mems=None, perm_mask=None, target_mapping=None,
-                start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
-                head_mask=None):
-        transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
-                                               input_mask=input_mask, attention_mask=attention_mask,
-                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
                                                head_mask=head_mask)
         hidden_states = transformer_outputs[0]
         start_logits = self.start_logits(hidden_states, p_mask=p_mask)

pytorch_transformers/tests/modeling_bert_test.py
@@ -126,8 +126,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
             model.eval()
-            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids)

             result = {
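
Because the positional order of the optional arguments changed, the tests switch to keyword arguments, which stay correct no matter where a parameter sits in the signature. A minimal sketch of the difference (hypothetical toy model mirroring the new BERT-style ordering, not library code):

    import torch

    class TinyModel(torch.nn.Module):
        # Mirrors the new ordering: mask-like inputs first, labels last.
        def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
            return input_ids, attention_mask, token_type_ids, labels

    model = TinyModel()
    input_ids = torch.ones(1, 5, dtype=torch.long)
    mask = torch.ones(1, 5, dtype=torch.long)

    # A positional call depends on the order, so its meaning shifted after the reorder...
    _, a, b, _ = model(input_ids, mask, None)
    # ...a keyword call stays unambiguous, which is why the tests now spell the names out.
    _, a, b, _ = model(input_ids, attention_mask=mask, token_type_ids=None)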
@@ -143,7 +143,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
             model.eval()
-            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -156,7 +156,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
             model.eval()
-            loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
                 "loss": loss,
                 "seq_relationship_score": seq_relationship_score,
@@ -170,7 +170,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
             model.eval()
-            loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
+            loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                                    masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -188,7 +189,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
+            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                   start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
@@ -207,7 +209,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
             model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
@@ -222,7 +224,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
             model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
@@ -241,9 +243,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             loss, logits = model(multiple_choice_inputs_ids,
-                                 multiple_choice_token_type_ids,
-                                 multiple_choice_input_mask,
-                                 choice_labels)
+                                 attention_mask=multiple_choice_input_mask,
+                                 token_type_ids=multiple_choice_token_type_ids,
+                                 labels=choice_labels)
             result = {
                 "loss": loss,
                 "logits": logits,

pytorch_transformers/tests/modeling_distilbert_test.py
@@ -148,7 +148,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForQuestionAnswering(config=config)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
+            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
@@ -166,7 +166,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = DistilBertForSequenceClassification(config)
             model.eval()
-            loss, logits = model(input_ids, input_mask, sequence_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
             result = {
                 "loss": loss,
                 "logits": logits,