chenpangpang / transformers · Commits

Unverified commit 367f497d, authored Nov 23, 2020 by Sylvain Gugger, committed by GitHub on Nov 23, 2020
Parent: e84786aa

Fix max length in run_plm script (#8738)
Changes: 1 changed file, with 9 additions and 12 deletions

examples/language-modeling/run_plm.py (+9, -12)
examples/language-modeling/run_plm.py (view file @ 367f497d)

@@ -93,11 +93,11 @@ class DataTrainingArguments:
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
-    max_seq_length: Optional[int] = field(
-        default=None,
+    max_seq_length: int = field(
+        default=512,
         metadata={
             "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated. Default to the max input length of the model."
+            "than this will be truncated."
         },
     )
     preprocessing_num_workers: Optional[int] = field(
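For context on the first hunk: max_seq_length becomes a plain int defaulting to 512 rather than an Optional[int] defaulting to None. A minimal sketch of how such a field behaves through HfArgumentParser, which the examples/ scripts use for CLI parsing (the dataclass is trimmed to this one field; the values and test invocations are illustrative, not part of the commit):

from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    max_seq_length: int = field(
        default=512,
        metadata={"help": "The maximum total input sequence length after tokenization."},
    )


parser = HfArgumentParser(DataTrainingArguments)

# No CLI flags: the new concrete default kicks in.
(args,) = parser.parse_args_into_dataclasses(args=[])
print(args.max_seq_length)  # 512

# Explicit flag overrides the default as before.
(args,) = parser.parse_args_into_dataclasses(args=["--max_seq_length", "1024"])
print(args.max_seq_length)  # 1024

With the old Optional[int] = None default, downstream code had to special-case None; the concrete default removes that branch, which is what the second hunk below cleans up.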
@@ -286,15 +286,12 @@ def main():
         load_from_cache_file=not data_args.overwrite_cache,
     )
 
-    if data_args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-    else:
-        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
-                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-            )
-        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+    if data_args.max_seq_length > tokenizer.model_max_length:
+        logger.warn(
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of
     # max_seq_length.
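The behavioral core of the second hunk is the unconditional clamp: whatever max_seq_length the user passes is capped at tokenizer.model_max_length, with a warning when it had to be lowered. A self-contained sketch of that logic (the helper name clamp_max_seq_length and the sample values are made up for illustration):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


def clamp_max_seq_length(requested: int, model_max_length: int) -> int:
    # Mirror of the warning in the diff: tell the user when the requested
    # length exceeds what the model supports.
    if requested > model_max_length:
        logger.warning(
            "The max_seq_length passed (%d) is larger than the maximum length for the "
            "model (%d). Using max_seq_length=%d.",
            requested,
            model_max_length,
            model_max_length,
        )
    # Unconditional clamp, as in the new code: min(...) is a no-op when the
    # requested length already fits.
    return min(requested, model_max_length)


print(clamp_max_seq_length(1024, 512))  # 512, after a warning
print(clamp_max_seq_length(128, 512))   # 128, no warning

One side note: the sketch uses logger.warning rather than the logger.warn seen in the diff, since warn is a long-deprecated alias for warning in the standard logging module.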