Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
6b4c3ee2
Commit
6b4c3ee2
authored
Jan 27, 2020
by
Julien Chaumond
Browse files
[run_lm_finetuning] GPT2 tokenizer doesn't have a pad_token
ping @lysandrejik
parent
79815bf6
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
7 additions
and
2 deletions
+7
-2
examples/run_lm_finetuning.py
examples/run_lm_finetuning.py
+7
-2
No files found.
examples/run_lm_finetuning.py
View file @
6b4c3ee2
...
...
@@ -202,6 +202,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
tokenizer
.
get_special_tokens_mask
(
val
,
already_has_special_tokens
=
True
)
for
val
in
labels
.
tolist
()
]
probability_matrix
.
masked_fill_
(
torch
.
tensor
(
special_tokens_mask
,
dtype
=
torch
.
bool
),
value
=
0.0
)
if
tokenizer
.
_pad_token
is
not
None
:
padding_mask
=
labels
.
eq
(
tokenizer
.
pad_token_id
)
probability_matrix
.
masked_fill_
(
padding_mask
,
value
=
0.0
)
masked_indices
=
torch
.
bernoulli
(
probability_matrix
).
bool
()
...
...
@@ -228,6 +229,8 @@ def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedToke
args
.
train_batch_size
=
args
.
per_gpu_train_batch_size
*
max
(
1
,
args
.
n_gpu
)
def collate(examples: List[torch.Tensor]):
    """Right-pad a batch of variable-length token tensors to one batch tensor.

    When the tokenizer defines a pad token, its id is used as the fill
    value; otherwise (e.g. GPT-2, which has no pad token) pad_sequence's
    default fill of 0 is used.
    """
    if tokenizer._pad_token is not None:
        return pad_sequence(
            examples, batch_first=True, padding_value=tokenizer.pad_token_id
        )
    # No pad token configured: fall back to pad_sequence's default fill.
    return pad_sequence(examples, batch_first=True)
train_sampler
=
RandomSampler
(
train_dataset
)
if
args
.
local_rank
==
-
1
else
DistributedSampler
(
train_dataset
)
...
...
@@ -421,6 +424,8 @@ def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefi
# Note that DistributedSampler samples randomly
def collate(examples: List[torch.Tensor]):
    """Batch variable-length example tensors by right-padding to the longest.

    Falls back to pad_sequence's default fill (0) when the tokenizer has
    no pad token (e.g. GPT-2); otherwise pads with the pad token id.
    """
    has_pad_token = tokenizer._pad_token is not None
    if not has_pad_token:
        return pad_sequence(examples, batch_first=True)
    fill = tokenizer.pad_token_id
    return pad_sequence(examples, batch_first=True, padding_value=fill)
eval_sampler
=
SequentialSampler
(
eval_dataset
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment