Commit c0a84df5, authored by Anthony Chen and committed by Facebook GitHub Bot
Browse files

disable memory profiler by default + remove force disable + add logging

Summary: Pull Request resolved: https://github.com/facebookresearch/d2go/pull/581

Reviewed By: wat3rBro

Differential Revision: D46913792

fbshipit-source-id: cf3c3812c455091fbf63842443644d2571976017
parent 7f17bbf0
...@@ -343,8 +343,7 @@ class Detectron2GoRunner(D2GoDataAPIMixIn, BaseRunner): ...@@ -343,8 +343,7 @@ class Detectron2GoRunner(D2GoDataAPIMixIn, BaseRunner):
def build_model(self, cfg, eval_only=False): def build_model(self, cfg, eval_only=False):
# Attach memory profiler to GPU OOM events # Attach memory profiler to GPU OOM events
# Disabled since it can cause ranks to die if cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False):
if False and cfg.get("MEMORY_PROFILER", CfgNode()).get("ENABLED", False):
attach_oom_logger( attach_oom_logger(
cfg.OUTPUT_DIR, trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES cfg.OUTPUT_DIR, trace_max_entries=cfg.MEMORY_PROFILER.TRACE_MAX_ENTRIES
) )
......
...@@ -51,6 +51,9 @@ class D2GoGpuMemorySnapshot(HookBase): ...@@ -51,6 +51,9 @@ class D2GoGpuMemorySnapshot(HookBase):
self.log_n_steps = log_n_steps self.log_n_steps = log_n_steps
self.log_during_train_at = log_during_train_at self.log_during_train_at = log_during_train_at
self.trace_max_entries = trace_max_entries self.trace_max_entries = trace_max_entries
logger.warning(
"WARNING: Memory snapshot profiler is enabled. This may cause ranks to die and training jobs to get stuck. Please use with caution."
)
def before_step(self): def before_step(self):
if self.trainer.iter == self.log_during_train_at: if self.trainer.iter == self.log_during_train_at:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment