"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e3d2bee8d034e0c2f1dfbb5555e157b3c8d4b2a4"
Unverified Commit e73a97a2 authored by Locke, committed by GitHub
Browse files

add preprocessing_num_workers to run_classification.py (#31586)

preprocessing_num_workers option to speedup preprocess
parent fc689d75
...@@ -133,6 +133,10 @@ class DataTrainingArguments:
)
},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
)
...@@ -573,6 +577,7 @@ def main():
raw_datasets = raw_datasets.map(
preprocess_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset",
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment