chenpangpang/transformers — Commit 3bf54172
Authored Feb 04, 2020 by Lysandre
Revert erroneous fix
Parent: 1ebfeb79
Showing 14 changed files with 37 additions and 37 deletions.
examples/contrib/run_openai_gpt.py       +1 -1
examples/distillation/distiller.py       +5 -5
examples/run_lm_finetuning.py            +1 -1
src/transformers/modeling_albert.py      +2 -2
src/transformers/modeling_bert.py        +7 -7
src/transformers/modeling_ctrl.py        +2 -2
src/transformers/modeling_distilbert.py  +2 -2
src/transformers/modeling_gpt2.py        +3 -3
src/transformers/modeling_openai.py      +3 -3
src/transformers/modeling_roberta.py     +2 -2
src/transformers/modeling_t5.py          +3 -3
src/transformers/modeling_transfo_xl.py  +2 -2
src/transformers/modeling_xlm.py         +2 -2
src/transformers/modeling_xlnet.py       +2 -2
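All of the changed lines below replace the ignore label -1 with -100, or pass ignore_index=-100 explicitly; -100 is the default ignore_index of PyTorch's nn.CrossEntropyLoss and nn.NLLLoss, so positions labeled -100 contribute nothing to the loss. A minimal sketch of that behavior, with made-up logits and labels:

    import torch
    import torch.nn as nn

    # Toy example: 4 positions over a 10-token vocabulary.
    logits = torch.randn(4, 10)
    labels = torch.tensor([3, -100, 7, -100])  # -100 marks positions with nothing to predict

    loss_fct = nn.CrossEntropyLoss()  # ignore_index defaults to -100
    loss = loss_fct(logits, labels)   # only positions 0 and 2 are scored

    # Same value as averaging the loss over just the non-ignored positions.
    manual = nn.CrossEntropyLoss()(logits[[0, 2]], labels[[0, 2]])
    assert torch.allclose(loss, manual)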
examples/contrib/run_openai_gpt.py
@@ -81,7 +81,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
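Pre-filling lm_labels with -100 means that only the positions later overwritten with real token ids (further down in pre_process_datasets, not shown in this hunk) are scored by the language-modeling loss. A rough sketch of that pattern with made-up sizes and token ids:

    import numpy as np

    n_batch, n_choices, input_len = 2, 2, 8  # illustrative sizes
    # Untouched (padding) positions keep -100 and are ignored by the LM loss later.
    lm_labels = np.full((n_batch, n_choices, input_len), fill_value=-100, dtype=np.int64)

    tokens = [40, 12, 7, 99, 3]  # hypothetical encoded sequence for item 0, choice 0
    lm_labels[0, 0, : len(tokens)] = tokens
    # Positions len(tokens) .. input_len-1 still hold -100 and contribute no loss.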
examples/distillation/distiller.py
@@ -109,7 +109,7 @@ class Distiller:
         self.last_log = 0
         self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
-        self.lm_loss_fct = nn.CrossEntropyLoss()
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
         if self.alpha_mse > 0.0:
             self.mse_loss_fct = nn.MSELoss(reduction="sum")
         if self.alpha_cos > 0.0:
@@ -200,7 +200,7 @@ class Distiller:
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
+            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -244,7 +244,7 @@ class Distiller:
         )
         token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
-        mlm_labels[~pred_mask] = -1  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
@@ -265,7 +265,7 @@ class Distiller:
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -273,7 +273,7 @@ class Distiller:
         attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
         clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -1  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
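For context, the constructor lines above set up the distillation losses: a KL divergence between softened student and teacher distributions and a hard-label LM loss that skips -100 positions. A minimal sketch of how such losses are typically combined; the temperature, the alpha weights, and the absence of masking here are illustrative assumptions, not taken from distiller.py:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    temperature = 2.0                # assumed softening temperature
    alpha_ce, alpha_mlm = 0.5, 0.5   # assumed loss weights

    ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)

    # Toy student/teacher logits over a 10-token vocab for 3 positions.
    s_logits = torch.randn(3, 10, requires_grad=True)
    t_logits = torch.randn(3, 10)
    mlm_labels = torch.tensor([4, -100, 9])  # the -100 position carries no MLM loss

    # Soft-target KL between softened distributions, plus hard-label MLM loss.
    loss_ce = ce_loss_fct(
        F.log_softmax(s_logits / temperature, dim=-1),
        F.softmax(t_logits / temperature, dim=-1),
    ) * (temperature ** 2)
    loss_mlm = lm_loss_fct(s_logits, mlm_labels)
    loss = alpha_ce * loss_ce + alpha_mlm * loss_mlm
    loss.backward()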
examples/run_lm_finetuning.py
@@ -207,7 +207,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
         padding_mask = labels.eq(tokenizer.pad_token_id)
         probability_matrix.masked_fill_(padding_mask, value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
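The surrounding mask_tokens function follows the usual BERT masking recipe: sample a subset of positions, set the labels of all other positions to -100, then replace 80% of the sampled positions with the mask token; the standard recipe splits the remaining 20% between random tokens and the unchanged original, which is assumed here since that part falls outside the hunk. A self-contained sketch with a made-up mask id and vocabulary size instead of a real tokenizer:

    import torch

    def mask_tokens_sketch(inputs, mask_token_id=103, vocab_size=1000, mlm_probability=0.15):
        """Simplified BERT-style masking; assumes no special or padding tokens."""
        labels = inputs.clone()
        probability_matrix = torch.full(labels.shape, mlm_probability)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # only masked positions contribute to the loss

        # 80% of masked positions -> mask token
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = mask_token_id

        # half of the rest (10% overall) -> random token; the final 10% stay unchanged
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_tokens = torch.randint(vocab_size, labels.shape, dtype=torch.long)
        inputs[indices_random] = random_tokens[indices_random]
        return inputs, labels

    inputs, labels = mask_tokens_sketch(torch.randint(5, 1000, (2, 16)))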
src/transformers/modeling_albert.py
@@ -632,8 +632,8 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
             labels in ``[0, ..., config.vocab_size]``
         Returns:
src/transformers/modeling_bert.py
@@ -846,8 +846,8 @@ class BertForPreTraining(BertPreTrainedModel):
         r"""
         masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
@@ -948,13 +948,13 @@ class BertForMaskedLM(BertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         Returns:
@@ -1015,7 +1015,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 2. If `lm_labels` is provided we are in a causal scenario where we
         # try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()  # -1 index = padding token
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
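The loss computation here uses the standard flatten-and-score pattern: the (batch, seq_len, vocab) prediction scores are reshaped to (batch*seq_len, vocab) and the labels to (batch*seq_len,), so each token position becomes one classification example and the -100 positions are dropped by the loss. A standalone sketch with made-up shapes:

    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_len, vocab_size = 2, 5, 30  # illustrative shapes
    prediction_scores = torch.randn(batch_size, seq_len, vocab_size)
    masked_lm_labels = torch.full((batch_size, seq_len), -100, dtype=torch.long)
    masked_lm_labels[0, 2] = 17  # only two positions are actually predicted
    masked_lm_labels[1, 4] = 5

    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    masked_lm_loss = loss_fct(
        prediction_scores.view(-1, vocab_size),  # (batch*seq_len, vocab)
        masked_lm_labels.view(-1),               # (batch*seq_len,)
    )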
src/transformers/modeling_ctrl.py
@@ -479,8 +479,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return:
src/transformers/modeling_distilbert.py
@@ -517,8 +517,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         Returns:
src/transformers/modeling_gpt2.py
@@ -547,8 +547,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return:
@@ -655,7 +655,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
src/transformers/modeling_openai.py
@@ -516,8 +516,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return:
@@ -621,7 +621,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.
src/transformers/modeling_roberta.py
@@ -200,8 +200,8 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         Returns:
src/transformers/modeling_t5.py
@@ -802,8 +802,8 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -1 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``.
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -906,7 +906,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss()
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             decoder_outputs = (
                 loss,
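The shift implements ordinary next-token prediction: the logits at position t are scored against the label at position t+1, so the last logit and the first label are dropped before the flattened cross-entropy. A minimal sketch with illustrative shapes:

    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_len, vocab_size = 2, 6, 50  # illustrative shapes
    lm_logits = torch.randn(batch_size, seq_len, vocab_size)
    lm_labels = torch.randint(vocab_size, (batch_size, seq_len))

    # Predict token t+1 from the logits at position t.
    shift_logits = lm_logits[..., :-1, :].contiguous()  # (batch, seq_len-1, vocab)
    shift_labels = lm_labels[..., 1:].contiguous()      # (batch, seq_len-1)

    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))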
src/transformers/modeling_transfo_xl.py
@@ -858,8 +858,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return:
src/transformers/modeling_xlm.py
@@ -667,8 +667,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return:
src/transformers/modeling_xlnet.py
@@ -993,8 +993,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-1`` are ignored (masked), the loss is only
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         Return: