"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "2351729f7d1c606624dcd2d1ad5dc8e627e17320"
Unverified Commit cf0755aa authored by Stas Bekman, committed by GitHub

[debug] DebugUnderflowOverflow doesn't work with DP (#12816)

parent ac3cb660
@@ -24,7 +24,11 @@ Underflow and Overflow Detection
 
 .. note::
 
-    This feature can be used with any ``nn.Module``-based model
+    For multi-GPU training it requires DDP (``torch.distributed.launch``).
+
+.. note::
+
+    This feature can be used with any ``nn.Module``-based model.
 
 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
...
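The note added above says the detector works with any nn.Module-based model, not only models driven by the Trainer. As a minimal sketch of what that means (the toy model is purely illustrative and the import path is an assumption, not taken from this commit), the detector can be attached to a model directly:

    import torch
    from torch import nn
    from transformers.debug_utils import DebugUnderflowOverflow

    # Any nn.Module-based model works; this toy network exists only for illustration.
    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))

    # Registers forward hooks on the submodules; the first inf/nan detected in
    # activations or weights is reported along with the surrounding frames.
    debug_overflow = DebugUnderflowOverflow(model)  # noqa: F841

    out = model(torch.randn(4, 8))  # forward passes are now traced

Under multi-GPU this standalone usage has the same constraint as the note: launch with DDP (torch.distributed.launch), not DP.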
@@ -1114,7 +1114,14 @@ class Trainer:
             num_train_samples = args.max_steps * total_train_batch_size
 
         if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
-            debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
+            if self.args.n_gpu > 1:
+                # nn.DataParallel(model) replicates the model, creating new variables; module
+                # references registered here no longer work on the other gpus, breaking this debug feature
+                raise ValueError(
+                    "Currently --debug underflow_overflow is not supported under DP. Please use DDP (torch.distributed.launch)."
+                )
+            else:
+                debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
         delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE
         if args.deepspeed:
...
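For context on the path being guarded: the detector is switched on through the Trainer's debug option, the string form of which appears in the error message above (--debug underflow_overflow). A hedged sketch of that configuration, with an arbitrary output_dir and no claims beyond the flag itself:

    from transformers import TrainingArguments

    # "underflow_overflow" maps to the DebugOption.UNDERFLOW_OVERFLOW value checked above.
    args = TrainingArguments(
        output_dir="output",
        debug="underflow_overflow",
    )

With more than one visible GPU and no distributed launcher, Trainer falls back to nn.DataParallel (args.n_gpu > 1), which is exactly the case the new ValueError rejects; launching the same script through torch.distributed.launch uses DDP instead and keeps the detector working.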
@@ -420,7 +420,7 @@ class TrainerMemoryTracker:
         self.cur_stage = None
 
     def update_metrics(self, stage, metrics):
-        """stop tracking for the passed stage"""
+        """updates the metrics"""
 
         if self.skip_memory_metrics:
             return
@@ -442,7 +442,7 @@ class TrainerMemoryTracker:
             metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t]
 
     def stop_and_update_metrics(self, metrics=None):
-        """combine stop + update in one call for simpler code"""
+        """combine stop and metrics update in one call for simpler code"""
 
         if self.skip_memory_metrics:
             return
...