"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "4c2538b863d8949a98d6b8dc1dea9ed4cf96a5df"
Unverified Commit ad895af9 authored by Sylvain Gugger, committed by GitHub

Add possibility to switch between APEX and AMP in Trainer (#9137)



* Add possibility to switch between APEX and AMP in Trainer

* Update src/transformers/training_args.py
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Address review comments

* Update src/transformers/training_args.py
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
parent 0b2f46fa
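
The new TrainingArguments option added by this commit can be exercised as in the short sketch below. This is a hedged usage example, not part of the diff: the output_dir value is a placeholder, and only the fp16 and fp16_backend arguments are taken from the change itself.

# Usage sketch (assumes a transformers version that contains this commit).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output",    # placeholder directory
    fp16=True,              # enable mixed precision training
    fp16_backend="apex",    # force APEX; "amp" forces native AMP, "auto" (default) picks one
)
# Pass `args` to a Trainer as usual; the Trainer then sets use_amp/use_apex from it.
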
src/transformers/trainer.py

@@ -93,8 +93,7 @@ from .training_args import TrainingArguments
 from .utils import logging

-_use_native_amp = False
-_use_apex = False
+_is_native_amp_available = False

 DEFAULT_CALLBACKS = [DefaultFlowCallback]
 DEFAULT_PROGRESS_CALLBACK = ProgressCallback
@@ -110,16 +109,10 @@ if version.parse(torch.__version__) < version.parse("1.6"):
     if is_apex_available():
         from apex import amp
-        _use_apex = True
 else:
-    _use_native_amp = True
+    _is_native_amp_available = True
     from torch.cuda.amp import autocast

-if version.parse(torch.__version__) < version.parse("1.2"):
-    _use_ddp_no_sync = False
-else:
-    _use_ddp_no_sync = True
-
 if is_datasets_available():
     import datasets
@@ -292,13 +285,30 @@ class Trainer:
             if isinstance(eval_dataset, datasets.Dataset):
                 self._remove_unused_columns(self.eval_dataset, description="evaluation")

+        # Mixed precision setup
+        self.use_apex = False
+        self.use_amp = False
+        if args.fp16:
+            if args.fp16_backend == "auto":
+                backend = "amp" if _is_native_amp_available else "apex"
+            else:
+                backend = args.fp16_backend
+
+            if backend == "amp":
+                self.use_amp = True
+                self.scaler = torch.cuda.amp.GradScaler()
+            else:
+                if not is_apex_available():
+                    raise ImportError(
+                        "Using FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex."
+                    )
+                self.use_apex = True
+
         self.state = TrainerState()
         self.control = TrainerControl()
         # Internal variable for total_flos used to count as tensors (for distributed + TPU), will be sent in the
         # state at each call to self.log.
         self._total_flos = None
-        if self.args.fp16 and _use_native_amp:
-            self.scaler = torch.cuda.amp.GradScaler()
         self.hp_search_backend = None
         self.use_tune_checkpoints = False
         default_label_names = (
@@ -625,9 +635,7 @@ class Trainer:
         # Mixed precision training with apex (torch < 1.6)
         model = self.model
-        if self.args.fp16 and _use_apex:
-            if not is_apex_available():
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        if self.use_apex:
             model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

         # Multi-gpu training (should be after apex fp16 initialization)
@@ -756,11 +764,8 @@ class Trainer:
                 if (step + 1) % self.args.gradient_accumulation_steps == 0:
                     self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control)

-                if (
-                    ((step + 1) % self.args.gradient_accumulation_steps != 0)
-                    and self.args.local_rank != -1
-                    and _use_ddp_no_sync
-                ):
+                if ((step + 1) % self.args.gradient_accumulation_steps != 0) and self.args.local_rank != -1:
                     # Avoid unnecessary DDP synchronization since there will be no backward pass on this example.
                     with model.no_sync():
                         tr_loss += self.training_step(model, inputs)
                 else:
@@ -772,17 +777,17 @@ class Trainer:
                     steps_in_epoch <= self.args.gradient_accumulation_steps
                     and (step + 1) == steps_in_epoch
                 ):
-                    if self.args.fp16 and _use_native_amp:
+                    if self.use_amp:
                         self.scaler.unscale_(self.optimizer)
                         torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
-                    elif self.args.fp16 and _use_apex:
+                    elif self.use_apex:
                         torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm)
                     else:
                         torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                     if is_torch_tpu_available():
                         xm.optimizer_step(self.optimizer)
-                    elif self.args.fp16 and _use_native_amp:
+                    elif self.use_amp:
                         self.scaler.step(self.optimizer)
                         self.scaler.update()
                     else:
@@ -1089,7 +1094,7 @@ class Trainer:
         model.train()
         inputs = self._prepare_inputs(inputs)

-        if self.args.fp16 and _use_native_amp:
+        if self.use_amp:
             with autocast():
                 loss = self.compute_loss(model, inputs)
         else:
@@ -1101,9 +1106,9 @@ class Trainer:
         if self.args.gradient_accumulation_steps > 1:
             loss = loss / self.args.gradient_accumulation_steps

-        if self.args.fp16 and _use_native_amp:
+        if self.use_amp:
             self.scaler.scale(loss).backward()
-        elif self.args.fp16 and _use_apex:
+        elif self.use_apex:
             with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                 scaled_loss.backward()
         else:
@@ -1498,7 +1503,7 @@ class Trainer:
                 ignore_keys = []

         with torch.no_grad():
-            if self.args.fp16 and _use_native_amp:
+            if self.use_amp:
                 with autocast():
                     outputs = model(**inputs)
             else:
src/transformers/training_args.py

@@ -211,6 +211,10 @@ class TrainingArguments:
            When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
            stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
            step can take a long time) but will not yield the same results as the interrupted training would have.
+        fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
+            The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
+            :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
+            other choices will force the requested backend.
     """

     output_dir: str = field(
@@ -378,6 +382,10 @@ class TrainingArguments:
             "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data."
         },
     )
+    fp16_backend: str = field(
+        default="auto",
+        metadata={"help": "The backend to be used for mixed precision. Should be one of 'auto', 'amp' or 'apex'."},
+    )

     def __post_init__(self):
         if self.disable_tqdm is None:
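
The "auto" behaviour documented above can be illustrated with a small standalone helper. This is a hedged sketch, not library code: resolve_fp16_backend is a hypothetical name, and its version check mirrors the _is_native_amp_available flag set in trainer.py (native AMP requires PyTorch >= 1.6).

# Hypothetical helper (not part of transformers) mirroring the backend resolution above.
import torch
from packaging import version


def resolve_fp16_backend(fp16_backend: str = "auto") -> str:
    # Native AMP (torch.cuda.amp) is only available from PyTorch 1.6 onwards.
    native_amp_available = version.parse(torch.__version__) >= version.parse("1.6")
    if fp16_backend == "auto":
        return "amp" if native_amp_available else "apex"
    return fp16_backend  # "amp" or "apex", explicitly forced by the user
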
tests/test_trainer.py

@@ -798,34 +798,38 @@ class TrainerIntegrationTest(unittest.TestCase):
     def test_early_stopping_callback(self):
         # early stopping stops training before num_training_epochs
-        trainer = get_regression_trainer(
-            num_train_epochs=20,
-            gradient_accumulation_steps=1,
-            per_device_train_batch_size=16,
-            load_best_model_at_end=True,
-            evaluation_strategy=EvaluationStrategy.EPOCH,
-            compute_metrics=AlmostAccuracy(),
-            metric_for_best_model="accuracy",
-        )
-        trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
-        train_output = trainer.train()
-        self.assertLess(train_output.global_step, 20 * 64 / 16)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=tmp_dir,
+                num_train_epochs=20,
+                gradient_accumulation_steps=1,
+                per_device_train_batch_size=16,
+                load_best_model_at_end=True,
+                evaluation_strategy=EvaluationStrategy.EPOCH,
+                compute_metrics=AlmostAccuracy(),
+                metric_for_best_model="accuracy",
+            )
+            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
+            train_output = trainer.train()
+            self.assertLess(train_output.global_step, 20 * 64 / 16)

         # Invalid inputs to trainer with early stopping callback result in assertion error
-        trainer = get_regression_trainer(
-            num_train_epochs=20,
-            gradient_accumulation_steps=1,
-            per_device_train_batch_size=16,
-            evaluation_strategy=EvaluationStrategy.EPOCH,
-            compute_metrics=AlmostAccuracy(),
-            metric_for_best_model="accuracy",
-        )
-        trainer.add_callback(EarlyStoppingCallback(1))
-        self.assertEqual(trainer.state.global_step, 0)
-        try:
-            trainer.train()
-        except AssertionError:
-            self.assertEqual(trainer.state.global_step, 0)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=tmp_dir,
+                num_train_epochs=20,
+                gradient_accumulation_steps=1,
+                per_device_train_batch_size=16,
+                evaluation_strategy=EvaluationStrategy.EPOCH,
+                compute_metrics=AlmostAccuracy(),
+                metric_for_best_model="accuracy",
+            )
+            trainer.add_callback(EarlyStoppingCallback(1))
+            self.assertEqual(trainer.state.global_step, 0)
+            try:
+                trainer.train()
+            except AssertionError:
+                self.assertEqual(trainer.state.global_step, 0)

     def test_flos_extraction(self):
         trainer = get_regression_trainer(learning_rate=0.1)