Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
367f497d
"docs/vscode:/vscode.git/clone" did not exist on "53c8147afe7c7713f6e20f1f391c477e9cb4a7a1"
Unverified
Commit
367f497d
authored
Nov 23, 2020
by
Sylvain Gugger
Committed by
GitHub
Nov 23, 2020
Browse files
Fix max length in run_plm script (#8738)
parent
e84786aa
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
12 deletions
+9
-12
examples/language-modeling/run_plm.py
examples/language-modeling/run_plm.py
+9
-12
No files found.
examples/language-modeling/run_plm.py
View file @
367f497d
...
@@ -93,11 +93,11 @@ class DataTrainingArguments:
...
@@ -93,11 +93,11 @@ class DataTrainingArguments:
overwrite_cache
:
bool
=
field
(
overwrite_cache
:
bool
=
field
(
default
=
False
,
metadata
=
{
"help"
:
"Overwrite the cached training and evaluation sets"
}
default
=
False
,
metadata
=
{
"help"
:
"Overwrite the cached training and evaluation sets"
}
)
)
max_seq_length
:
Optional
[
int
]
=
field
(
max_seq_length
:
int
=
field
(
default
=
None
,
default
=
512
,
metadata
=
{
metadata
=
{
"help"
:
"The maximum total input sequence length after tokenization. Sequences longer "
"help"
:
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated.
Default to the max input length of the model.
"
"than this will be truncated."
},
},
)
)
preprocessing_num_workers
:
Optional
[
int
]
=
field
(
preprocessing_num_workers
:
Optional
[
int
]
=
field
(
...
@@ -286,15 +286,12 @@ def main():
...
@@ -286,15 +286,12 @@ def main():
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
load_from_cache_file
=
not
data_args
.
overwrite_cache
,
)
)
if
data_args
.
max_seq_length
is
None
:
if
data_args
.
max_seq_length
>
tokenizer
.
model_max_length
:
max_seq_length
=
tokenizer
.
model_max_length
logger
.
warn
(
else
:
f
"The max_seq_length passed (
{
data_args
.
max_seq_length
}
) is larger than the maximum length for the"
if
data_args
.
max_seq_length
>
tokenizer
.
model_max_length
:
f
"model (
{
tokenizer
.
model_max_length
}
). Using max_seq_length=
{
tokenizer
.
model_max_length
}
."
logger
.
warn
(
)
f
"The max_seq_length passed (
{
data_args
.
max_seq_length
}
) is larger than the maximum length for the"
max_seq_length
=
min
(
data_args
.
max_seq_length
,
tokenizer
.
model_max_length
)
f
"model (
{
tokenizer
.
model_max_length
}
). Using max_seq_length=
{
tokenizer
.
model_max_length
}
."
)
max_seq_length
=
min
(
data_args
.
max_seq_length
,
tokenizer
.
model_max_length
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# max_seq_length.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment