Commit b8513677 authored by Cao Yuhang, committed by Kai Chen
Browse files

fix bug that stuck at evaluation (#53)

* fix bug that stuck at evaluation

* remove mode variable
parent bef7c13a
...@@ -16,6 +16,15 @@ class TextLoggerHook(LoggerHook): ...@@ -16,6 +16,15 @@ class TextLoggerHook(LoggerHook):
super(TextLoggerHook, self).before_run(runner) super(TextLoggerHook, self).before_run(runner)
self.start_iter = runner.iter self.start_iter = runner.iter
def _get_max_memory(self, runner):
    """Return the peak allocated CUDA memory, in MB, as a one-element tensor.

    The value comes from ``torch.cuda.max_memory_allocated()``, is converted
    from bytes to megabytes and stored in an ``int`` tensor on the CUDA
    device. When ``runner.world_size`` is greater than 1, the tensor is
    passed through ``dist.reduce`` with the ``MAX`` op and destination
    rank 0.

    Args:
        runner: Object exposing a ``world_size`` attribute.

    Returns:
        A one-element ``torch.int`` tensor on the CUDA device; callers read
        the number with ``.item()``.
    """
    bytes_per_mb = 1024 * 1024
    peak_bytes = torch.cuda.max_memory_allocated()
    peak_mb = torch.tensor(
        [peak_bytes / bytes_per_mb],
        dtype=torch.int,
        device=torch.device('cuda'),
    )
    # Single-process runs have nothing to reduce across.
    if not runner.world_size > 1:
        return peak_mb
    # NOTE(review): `dist` is presumably `torch.distributed`, imported at
    # the top of the file -- confirm.
    dist.reduce(peak_mb, 0, op=dist.ReduceOp.MAX)
    return peak_mb
def log(self, runner): def log(self, runner):
if runner.mode == 'train': if runner.mode == 'train':
lr_str = ', '.join( lr_str = ', '.join(
...@@ -38,13 +47,9 @@ class TextLoggerHook(LoggerHook): ...@@ -38,13 +47,9 @@ class TextLoggerHook(LoggerHook):
'time: {log[time]:.3f}, data_time: {log[data_time]:.3f}, '. 'time: {log[time]:.3f}, data_time: {log[data_time]:.3f}, '.
format(log=runner.log_buffer.output)) format(log=runner.log_buffer.output))
# statistic memory # statistic memory
if runner.mode == 'train' and torch.cuda.is_available(): # training mode if the output contains the key "time"
mem = torch.cuda.max_memory_allocated() if 'time' in runner.log_buffer.output and torch.cuda.is_available():
mem_mb = torch.tensor([mem / (1024 * 1024)], mem_mb = self._get_max_memory(runner)
dtype=torch.int,
device=torch.device('cuda'))
if runner.world_size > 1:
dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
log_str += 'memory: {}, '.format(mem_mb.item()) log_str += 'memory: {}, '.format(mem_mb.item())
log_items = [] log_items = []
for name, val in runner.log_buffer.output.items(): for name, val in runner.log_buffer.output.items():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment