Add option to set max_len in run_ner (#12929)

fd85734e · Sylvain Gugger · GitHub · 1486fb81 · fd85734e
Unverified commit fd85734e, authored Jul 28, 2021 by Sylvain Gugger; committed by GitHub on Jul 28, 2021.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 8 additions and 0 deletions

examples/pytorch/token-classification/run_ner.py examples/pytorch/token-classification/run_ner.py +8 -0

No files found.
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -123,6 +123,13 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
+    max_seq_length: int = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. If set, sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
@@ -358,6 +365,7 @@ def main():
            examples[text_column_name],
            padding=padding,
            truncation=True,
+            max_length=data_args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )