chenpangpang / transformers · Commit 3bf54172

Authored Feb 04, 2020 by Lysandre

Revert erroneous fix

parent 1ebfeb79
Showing 14 changed files with 37 additions and 37 deletions (+37, -37)
examples/contrib/run_openai_gpt.py        +1  -1
examples/distillation/distiller.py        +5  -5
examples/run_lm_finetuning.py             +1  -1
src/transformers/modeling_albert.py       +2  -2
src/transformers/modeling_bert.py         +7  -7
src/transformers/modeling_ctrl.py         +2  -2
src/transformers/modeling_distilbert.py   +2  -2
src/transformers/modeling_gpt2.py         +3  -3
src/transformers/modeling_openai.py       +3  -3
src/transformers/modeling_roberta.py      +2  -2
src/transformers/modeling_t5.py           +3  -3
src/transformers/modeling_transfo_xl.py   +2  -2
src/transformers/modeling_xlm.py          +2  -2
src/transformers/modeling_xlnet.py        +2  -2
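For context on the change itself (an editorial aside, not part of the commit): every hunk below replaces the label ignore value -1 with -100, which is the default `ignore_index` of PyTorch's `nn.CrossEntropyLoss` (and `nn.NLLLoss`). A minimal sketch of that behaviour, with made-up logits and labels:

import torch
import torch.nn as nn

# Hypothetical 3-class logits for 4 token positions.
logits = torch.randn(4, 3)
labels = torch.tensor([0, 2, -100, 1])  # -100 marks "nothing to predict"

loss_fct = nn.CrossEntropyLoss()        # ignore_index defaults to -100, not -1
loss = loss_fct(logits, labels)         # the -100 position contributes nothing

# Same value as averaging over the three labelled positions only.
manual = nn.CrossEntropyLoss()(logits[labels != -100], labels[labels != -100])
assert torch.allclose(loss, manual)

Restoring -100 therefore appears to be the point of the revert: with the default loss settings, positions marked -100 are excluded from the loss, whereas -1 would not be.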
examples/contrib/run_openai_gpt.py

@@ -81,7 +81,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
     n_batch = len(dataset)
     input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
     mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-    lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
+    lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
     mc_labels = np.zeros((n_batch,), dtype=np.int64)
     for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
         with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
examples/distillation/distiller.py

@@ -109,7 +109,7 @@ class Distiller:
         self.last_log = 0

         self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
-        self.lm_loss_fct = nn.CrossEntropyLoss()
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
         if self.alpha_mse > 0.0:
             self.mse_loss_fct = nn.MSELoss(reduction="sum")
         if self.alpha_cos > 0.0:

@@ -200,7 +200,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
+        mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)

@@ -244,7 +244,7 @@ class Distiller:
             )
             token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
-        mlm_labels[~pred_mask] = -1  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

@@ -265,7 +265,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
+        clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)

@@ -273,7 +273,7 @@ class Distiller:
         attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
         clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -1  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
examples/run_lm_finetuning.py

@@ -207,7 +207,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
         padding_mask = labels.eq(tokenizer.pad_token_id)
         probability_matrix.masked_fill_(padding_mask, value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens

     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
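As an aside (not from the diff), the masking pattern in `mask_tokens` above can be sketched in isolation; the vocabulary size and masking probability below are illustrative stand-ins for the tokenizer and the script's `mlm_probability` argument:

import torch

# Stand-ins: a small batch of token ids and a 15% MLM masking probability.
labels = torch.randint(0, 30522, (2, 8))
probability_matrix = torch.full(labels.shape, 0.15)

masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100  # loss is only computed on the masked tokens

# The full function then replaces 80% of the masked positions with the [MASK]
# token id, 10% with a random token, and keeps 10% unchanged.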
src/transformers/modeling_albert.py

@@ -632,8 +632,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
     r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
             labels in ``[0, ..., config.vocab_size]``

     Returns:
src/transformers/modeling_bert.py

@@ -846,8 +846,8 @@ class BertForPreTraining(BertPreTrainedModel):
     r"""
         masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)

@@ -948,13 +948,13 @@ class BertForMaskedLM(BertPreTrainedModel):
     r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

     Returns:

@@ -1015,7 +1015,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 2. If `lm_labels` is provided we are in a causal scenario where we
         #    try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()  # -1 index = padding token
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
src/transformers/modeling_ctrl.py

@@ -479,8 +479,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
src/transformers/modeling_distilbert.py

@@ -517,8 +517,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

     Returns:
src/transformers/modeling_gpt2.py

@@ -547,8 +547,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:

@@ -655,7 +655,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
src/transformers/modeling_openai.py

@@ -516,8 +516,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:

@@ -621,7 +621,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
src/transformers/modeling_roberta.py

@@ -200,8 +200,8 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

     Returns:
src/transformers/modeling_t5.py

@@ -802,8 +802,8 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -1 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``.

     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:

@@ -906,7 +906,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss()
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             decoder_outputs = (loss,
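Outside the diff, the shift-and-ignore pattern used in the T5 hunk above can be illustrated with toy tensors (shapes and values are invented for the example):

import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_length, vocab_size = 2, 5, 11
lm_logits = torch.randn(batch_size, seq_length, vocab_size)
lm_labels = torch.randint(0, vocab_size, (batch_size, seq_length))
lm_labels[:, -1] = -100  # e.g. padding positions, ignored by the loss

# Predict token t+1 from position t, as in the hunk above.
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()

loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))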
src/transformers/modeling_transfo_xl.py

@@ -858,8 +858,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
src/transformers/modeling_xlm.py

@@ -667,8 -667,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return:
src/transformers/modeling_xlnet.py

@@ -993,8 +993,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

     Return: