[Fix] Fix the bug that the training log and evaluating log are mixed (#1252)

* [Fix] Fix the bug that training log and evaluating log are mixed * [Fix] Fix the bug that training log and evaluating log are mixed * fix comment * fix import error * refactor * refactor * refactor * clear log_buffer before evaluation * fix error * add unittest

[Fix] Fix the bug that the training log and evaluating log are mixed (#1252)
* [Fix] Fix the bug that training log and evaluating log are mixed * [Fix] Fix the bug that training log and evaluating log are mixed * fix comment * fix import error * refactor * refactor * refactor * clear log_buffer before evaluation * fix error * add unittest
846d3a4a · Zaida Zhou · GitHub · 18c64d5f · 846d3a4a · 846d3a4a
Unverified Commit 846d3a4a authored Aug 11, 2021 by Zaida Zhou Committed by GitHub Aug 11, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 17 deletions

mmcv/runner/hooks/evaluation.py mmcv/runner/hooks/evaluation.py +18 -8

tests/test_runner/test_eval_hook.py tests/test_runner/test_eval_hook.py +42 -9

No files found.
--- a/mmcv/runner/hooks/evaluation.py
+++ b/mmcv/runner/hooks/evaluation.py
@@ -10,6 +10,7 @@ from torch.utils.data import DataLoader

 from mmcv.utils import is_seq_of
 from .hook import Hook
+from .logger import LoggerHook


 class EvalHook(Hook):
@@ -212,19 +213,31 @@ class EvalHook(Hook):

    def after_train_iter(self, runner):
        """Called after every training iter to evaluate the results."""
-        if not self.by_epoch:
+        if not self.by_epoch and self._should_evaluate(runner):
+            # Because the priority of EvalHook is higher than LoggerHook, the
+            # training log and the evaluating log are mixed. Therefore,
+            # we need to dump the training log and clear it before evaluating
+            # log is generated. In addition, this problem will only appear in
+            # `IterBasedRunner` whose `self.by_epoch` is False, because
+            # `EpochBasedRunner` whose `self.by_epoch` is True calls
+            # `_do_evaluate` in `after_train_epoch` stage, and at this stage
+            # the training log has been printed, so it will not cause any
+            # problem. more details at
+            # https://github.com/open-mmlab/mmsegmentation/issues/694
+            for hook in runner._hooks:
+                if isinstance(hook, LoggerHook):
+                    hook.after_train_iter(runner)
+            runner.log_buffer.clear()
+
            self._do_evaluate(runner)

    def after_train_epoch(self, runner):
        """Called after every training epoch to evaluate the results."""
-        if self.by_epoch:
+        if self.by_epoch and self._should_evaluate(runner):
            self._do_evaluate(runner)

    def _do_evaluate(self, runner):
        """perform evaluation and save ckpt."""
-        if not self._should_evaluate(runner):
-            return
-
        results = self.test_fn(runner.model, self.dataloader)
        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
        key_score = self.evaluate(runner, results)
@@ -419,9 +432,6 @@ class DistEvalHook(EvalHook):
                    dist.broadcast(module.running_var, 0)
                    dist.broadcast(module.running_mean, 0)

-        if not self._should_evaluate(runner):
-            return
-
        tmpdir = self.tmpdir
        if tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')

--- a/tests/test_runner/test_eval_hook.py
+++ b/tests/test_runner/test_eval_hook.py
+import json
 import os.path as osp
 import tempfile
 import unittest.mock as mock
@@ -7,13 +8,14 @@ from unittest.mock import MagicMock, patch
 import pytest
 import torch
 import torch.nn as nn
+import torch.optim as optim
 from torch.utils.data import DataLoader, Dataset

 from mmcv.runner import DistEvalHook as BaseDistEvalHook
 from mmcv.runner import EpochBasedRunner
 from mmcv.runner import EvalHook as BaseEvalHook
 from mmcv.runner import IterBasedRunner
-from mmcv.utils import get_logger
+from mmcv.utils import get_logger, scandir


 class ExampleDataset(Dataset):
@@ -48,18 +50,16 @@ class Model(nn.Module):

    def __init__(self):
        super().__init__()
-        self.linear = nn.Linear(2, 1)
+        self.param = nn.Parameter(torch.tensor([1.0]))

    def forward(self, x, **kwargs):
-        return x
+        return self.param * x

    def train_step(self, data_batch, optimizer, **kwargs):
-        if not isinstance(data_batch, dict):
-            data_batch = dict(x=data_batch)
-        return data_batch
+        return {'loss': torch.sum(self(data_batch['x']))}

-    def val_step(self, x, optimizer, **kwargs):
-        return dict(loss=self(x))
+    def val_step(self, data_batch, optimizer, **kwargs):
+        return {'loss': torch.sum(self(data_batch['x']))}


 def _build_epoch_runner():
@@ -307,7 +307,7 @@ def test_eval_hook():
                          (_build_iter_runner, False)])
 def test_start_param(EvalHookParam, _build_demo_runner, by_epoch):
    # create dummy data
-    dataloader = DataLoader(torch.ones((5, 2)))
+    dataloader = DataLoader(EvalDataset())

    # 0.1. dataloader is not a DataLoader object
    with pytest.raises(TypeError):
@@ -389,3 +389,36 @@ def test_start_param(EvalHookParam, _build_demo_runner, by_epoch):
        runner._iter = 1
    runner.run([dataloader], [('train', 1)], 3)
    assert evalhook.evaluate.call_count == 2  # after epoch 2 & 3
+
+
+@pytest.mark.parametrize('runner,by_epoch,eval_hook_priority',
+                         [(EpochBasedRunner, True, 'NORMAL'),
+                          (EpochBasedRunner, True, 'LOW'),
+                          (IterBasedRunner, False, 'LOW')])
+def test_logger(runner, by_epoch, eval_hook_priority):
+    loader = DataLoader(EvalDataset())
+    model = Model()
+    data_loader = DataLoader(EvalDataset())
+    eval_hook = EvalHook(
+        data_loader, interval=1, by_epoch=by_epoch, save_best='acc')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        logger = get_logger('test_logger')
+        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+        runner = EpochBasedRunner(
+            model=model, optimizer=optimizer, work_dir=tmpdir, logger=logger)
+        runner.register_logger_hooks(
+            dict(
+                interval=1,
+                hooks=[dict(type='TextLoggerHook', by_epoch=by_epoch)]))
+        runner.register_timer_hook(dict(type='IterTimerHook'))
+        runner.register_hook(eval_hook, priority=eval_hook_priority)
+        runner.run([loader], [('train', 1)], 1)
+
+        path = osp.join(tmpdir, next(scandir(tmpdir, '.json')))
+        with open(path) as fr:
+            fr.readline()  # skip first line which is hook_msg
+            train_log = json.loads(fr.readline())
+            assert train_log['mode'] == 'train' and 'time' in train_log
+            val_log = json.loads(fr.readline())
+            assert val_log['mode'] == 'val' and 'time' not in val_log