transformers / commit cf0755aa
Unverified commit, authored Jul 21, 2021 by Stas Bekman and committed via GitHub on Jul 21, 2021.
[debug] DebugUnderflowOverflow doesn't work with DP (#12816)
Parent: ac3cb660
Showing 3 changed files with 15 additions and 4 deletions (+15 / -4):

  docs/source/debugging.rst          +5 / -1
  src/transformers/trainer.py        +8 / -1
  src/transformers/trainer_utils.py  +2 / -2
docs/source/debugging.rst
@@ -24,7 +24,11 @@ Underflow and Overflow Detection
 .. note::
 
-    This feature can be used with any ``nn.Module``-based model
+    For multi-GPU training it requires DDP (``torch.distributed.launch``).
+
+.. note::
+
+    This feature can be used with any ``nn.Module``-based model.
 
 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
...
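
To make the updated note concrete, below is a hedged sketch of the standalone usage it refers to: attaching the detector to a plain nn.Module outside the Trainer. The toy model and input shapes are invented for illustration and are not part of this commit; only the import and the one-line registration follow the documented API.

# A minimal sketch, assuming a toy model; DebugUnderflowOverflow can wrap any
# nn.Module-based model, as the note says.
import torch
from torch import nn
from transformers.debug_utils import DebugUnderflowOverflow

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 2))

# Registers forward hooks on every submodule; if an inf/nan shows up in
# activations or weights, the run is aborted with a report of the module
# calls (frames) that led up to it.
debug_overflow = DebugUnderflowOverflow(model)

out = model(torch.randn(4, 16))  # well-behaved inputs: no report, run continues

Under DDP each process owns its own single replica, so hooks registered this way stay attached to the modules that actually run; under DP the per-forward replication described in the trainer.py change below breaks that assumption.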
src/transformers/trainer.py
@@ -1114,6 +1114,13 @@ class Trainer:
             num_train_samples = args.max_steps * total_train_batch_size
 
         if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
-            debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
+            if self.args.n_gpu > 1:
+                # nn.DataParallel(model) replicates the model, creating new variables and module
+                # references registered here no longer work on other gpus, breaking the module
+                raise ValueError(
+                    "Currently --debug underflow_overflow is not supported under DP. Please use DDP (torch.distributed.launch)."
+                )
+            else:
+                debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
         delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE
...
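
Below is a hedged sketch of how a user would hit the new guard from the Trainer API; the toy model, dataset, and argument values are placeholders invented for illustration and are not part of the commit.

import torch
from torch import nn
from transformers import Trainer, TrainingArguments

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 2)

    def forward(self, x=None, labels=None):
        logits = self.linear(x)
        loss = nn.functional.cross_entropy(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}

# eight tiny samples; the default collator stacks the tensors into batches
train_dataset = [{"x": torch.randn(4), "labels": torch.tensor(0)} for _ in range(8)]

args = TrainingArguments(
    output_dir="out",
    debug="underflow_overflow",  # parsed into DebugOption.UNDERFLOW_OVERFLOW
    num_train_epochs=1,
)
trainer = Trainer(model=ToyModel(), args=args, train_dataset=train_dataset)

# On a single-process multi-GPU machine (args.n_gpu > 1, i.e. nn.DataParallel),
# train() now fails fast with the ValueError above instead of silently missing
# activations on the extra GPUs. On CPU, a single GPU, or one-GPU-per-process
# DDP it behaves as before.
trainer.train()

The error message points at DDP because under torch.distributed.launch each process sees exactly one GPU, so self.args.n_gpu is 1 and the detector's hooks stay on the module that actually executes.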
src/transformers/trainer_utils.py
@@ -420,7 +420,7 @@ class TrainerMemoryTracker:
         self.cur_stage = None
 
     def update_metrics(self, stage, metrics):
-        """stop tracking for the passed stage"""
+        """updates the metrics"""
         if self.skip_memory_metrics:
             return
...
@@ -442,7 +442,7 @@ class TrainerMemoryTracker:
             metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
 
     def stop_and_update_metrics(self, metrics=None):
-        """combine stop + update in one call for simpler code"""
+        """combine stop and metrics update in one call for simpler code"""
         if self.skip_memory_metrics:
             return
...
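
The corrected docstrings describe the start / stop_and_update_metrics call pattern; below is a hedged sketch of that pattern, with an invented TinyLoop class standing in for Trainer and relying on the tracker deriving the stage name from the calling method (train/evaluate/predict), as it does inside the real Trainer.

from transformers.trainer_utils import TrainerMemoryTracker

class TinyLoop:
    """Invented stand-in for Trainer, only to show the tracker call pattern."""

    def __init__(self):
        # the tracker disables itself if memory metrics are skipped or psutil is missing
        self._memory_tracker = TrainerMemoryTracker()

    def train(self):
        self._memory_tracker.start()          # stage "train", derived from this method's name
        _ = [i * i for i in range(100_000)]   # stand-in for the actual training work
        metrics = {"train_loss": 0.0}
        # one call that stops tracking AND folds the memory deltas into `metrics`,
        # instead of separate stop() + update_metrics(stage, metrics) calls
        self._memory_tracker.stop_and_update_metrics(metrics)
        return metrics

print(TinyLoop().train())  # when tracking is active, train_mem_*_delta keys are added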