Unverified commit 367f497d authored by Sylvain Gugger, committed by GitHub

Fix max length in run_plm script (#8738)

parent e84786aa
@@ -93,11 +93,11 @@ class DataTrainingArguments:
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
-    max_seq_length: Optional[int] = field(
-        default=None,
+    max_seq_length: int = field(
+        default=512,
         metadata={
             "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated. Default to the max input length of the model."
+            "than this will be truncated."
         },
     )
     preprocessing_num_workers: Optional[int] = field(
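
The first hunk replaces the Optional[int] field (default None, meaning "use the model's maximum input length") with a plain int defaulting to 512. Below is a minimal, self-contained sketch of the resulting field with the rest of DataTrainingArguments elided; in the real script the dataclass is consumed by transformers' HfArgumentParser, which turns each field's "help" metadata into the --max_seq_length CLI help text.

from dataclasses import dataclass, field

@dataclass
class DataTrainingArguments:
    # After this commit: a concrete default of 512 rather than
    # Optional[int] = None, which fell back to tokenizer.model_max_length.
    max_seq_length: int = field(
        default=512,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated."
        },
    )

args = DataTrainingArguments()
assert args.max_seq_length == 512  # new default; still overridable on the command line

run_plm.py trains XLNet-style permutation language models, and XLNet has no fixed input-size limit, so its tokenizer reports a very large sentinel as model_max_length; presumably that is why deferring to the model maximum produced an unusable sequence length and a fixed 512-token default was chosen instead.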
@@ -286,9 +286,6 @@ def main():
         load_from_cache_file=not data_args.overwrite_cache,
     )
-    if data_args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-    else:
-        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
-                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
......
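
The second hunk deletes the None fallback in main(). The diff view collapses the remainder of the hunk, so the logic that survives is reconstructed below as a hedged sketch; the resolve_max_seq_length helper name is hypothetical (the script inlines this logic directly in main()). It simply clamps the requested length to the model maximum and warns when it does.

import logging

logger = logging.getLogger(__name__)

def resolve_max_seq_length(requested: int, model_max_length: int) -> int:
    # Hypothetical helper: reconstructed from the removed lines above.
    # The exact post-commit wording is hidden behind the collapsed hunk.
    if requested > model_max_length:
        logger.warning(
            f"The max_seq_length passed ({requested}) is larger than the maximum "
            f"length for the model ({model_max_length}). Using max_seq_length={model_max_length}."
        )
    return min(requested, model_max_length)

# With the new default: max_seq_length = resolve_max_seq_length(512, tokenizer.model_max_length)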