"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e1a5cc338ba9fba27b0ca1fb54c9951c5146a86f"
Unverified commit 3520e37e, authored by Zach Mueller and committed by GitHub

Enable split_batches through TrainingArguments (#26798)

* Enable split_batches through TrainingArguments

* Extra dispatch_batches

* Keep as default false

* Add to docstring

* Add to docstring

* Remove the capturewarnings change

* Comma
parent 95020f20
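
In practice, the new flag is set directly on TrainingArguments and forwarded to accelerate's Accelerator by the Trainer. A minimal usage sketch (the output directory and batch size are illustrative values, not from this commit):

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="out",               # illustrative path
        per_device_train_batch_size=8,  # illustrative value
        split_batches=True,             # new flag added by this change
    )
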
@@ -3906,6 +3906,7 @@ class Trainer:
         # create accelerator object
         self.accelerator = Accelerator(
             dispatch_batches=self.args.dispatch_batches,
+            split_batches=self.args.split_batches,
             deepspeed_plugin=self.args.deepspeed_plugin,
             gradient_accumulation_plugin=gradient_accumulation_plugin,
         )
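
For reference, TrainingArguments.split_batches maps onto the Accelerator argument of the same name in accelerate. A minimal sketch of the equivalent direct construction, assuming an accelerate version contemporary with this change (where split_batches is still a direct Accelerator keyword):

    from accelerate import Accelerator

    # Equivalent of what the Trainer now builds internally: with
    # split_batches=True, each dataloader batch is sharded across processes
    # instead of every process drawing its own full batch.
    accelerator = Accelerator(split_batches=True)
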
@@ -621,6 +621,14 @@ class TrainingArguments:
             Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions.
             This flag is experimental and subject to change in future releases.
+        split_batches (`bool`, *optional*):
+            Whether or not the accelerator should split the batches yielded by the dataloaders across the devices
+            during distributed training. If set to `True`, the actual batch size used will be the same on any kind
+            of distributed processes, but it must be a round multiple of the number of processes you are using
+            (such as GPUs).
         include_tokens_per_second (`bool`, *optional*):
             Whether or not to compute the number of tokens per second per device for training speed metrics.
@@ -1226,6 +1234,15 @@ class TrainingArguments:
         },
     )
+    split_batches: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "Whether or not the accelerator should split the batches yielded by the dataloaders across the devices during distributed training. If"
+            " set to `True`, the actual batch size used will be the same on any kind of distributed processes, but it must be a"
+            " round multiple of the number of processes you are using (such as GPUs)."
+        },
+    )
     include_tokens_per_second: Optional[bool] = field(
         default=False,
         metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
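
To make the docstring's semantics concrete, a small worked example with illustrative numbers (not part of this commit):

    num_processes = 4        # e.g. 4 GPUs
    dataloader_batch = 32    # batch size each dataloader yields; must be a
                             # round multiple of num_processes (32 % 4 == 0)

    # split_batches=True: one dataloader batch is split across the processes.
    per_process = dataloader_batch // num_processes       # 8 samples per process
    global_batch_split = dataloader_batch                 # global batch stays 32

    # split_batches=False (the default): every process draws its own full batch.
    global_batch_default = dataloader_batch * num_processes  # global batch is 128

    print(per_process, global_batch_split, global_batch_default)
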