fix attribute mismatch for memory profiler

Summary: Pull Request resolved: https://github.com/facebookresearch/d2go/pull/544. The previous diff for the memory profiler (D45673764) did not pick up a config key rename, which caused an attribute-not-found error. This diff fixes the mismatch and adds two unit tests (one with a GPU, one without) that use the memory profiler in the runner. Reviewed By: wat3rBro. Differential Revision: D46114730. fbshipit-source-id: d066d435021983d90f4a75e0c88798a3aedcaf92

fix attribute mismatch for memory profiler
Summary: Pull Request resolved: https://github.com/facebookresearch/d2go/pull/544. The previous diff for the memory profiler (D45673764) did not pick up a config key rename, which caused an attribute-not-found error. This diff fixes the mismatch and adds two unit tests (one with a GPU, one without) that use the memory profiler in the runner. Reviewed By: wat3rBro. Differential Revision: D46114730. fbshipit-source-id: d066d435021983d90f4a75e0c88798a3aedcaf92
99c65490 · Anthony Chen · Facebook GitHub Bot · 2526b053 · 99c65490 · 99c65490
Commit 99c65490 authored May 24, 2023 by Anthony Chen Committed by Facebook GitHub Bot May 24, 2023
3 changed files
--- a/d2go/utils/gpu_memory_profiler.py
+++ b/d2go/utils/gpu_memory_profiler.py
@@ -15,7 +15,7 @@ def add_memory_profiler_configs(_C: CN):
    _C.MEMORY_PROFILER = CN()
    _C.MEMORY_PROFILER.ENABLED = False
    # max number of trace entries in memory snapshot
-    _C.MEMORY_PROFILER.MAX_ENTRIES = 1000000
+    _C.MEMORY_PROFILER.TRACE_MAX_ENTRIES = 1000000
    # Configs to be used by d2go.utils.gpu_memory_profiler.D2GoGpuMemorySnapshot
    # determine the number of iterations to log memory snapshots for
    _C.MEMORY_PROFILER.LOG_N_STEPS = 3

--- a/tests/runner/test_runner_default_runner.py
+++ b/tests/runner/test_runner_default_runner.py
@@ -40,7 +40,7 @@ class MetaArchForTest(torch.nn.Module):
            return self.inference(inputs)
        images = [x["image"] for x in inputs]
-        images = ImageList.from_tensors(images, 1)
+        images = ImageList.from_tensors(images, 1).to(self.device)
        ret = self.conv(images.tensor)
        ret = self.bn(ret)
        ret = self.relu(ret)

--- a/tests/utils/test_gpu_memory_profiler.py
+++ b/tests/utils/test_gpu_memory_profiler.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+import unittest
+import d2go.runner.default_runner as default_runner
+import torch
+from d2go.registry.builtin import META_ARCH_REGISTRY
+from d2go.utils.testing.data_loader_helper import create_local_dataset
+from d2go.utils.testing.helper import tempdir
+from detectron2.structures import ImageList
+TEST_CUDA: bool = torch.cuda.is_available()
+@META_ARCH_REGISTRY.register()
+class MetaArchForTestSimple(torch.nn.Module):
+    # Minimal meta-architecture for tests: conv -> batch norm -> ReLU -> average pool.
+    # Registered in META_ARCH_REGISTRY so a config can select it by class name
+    # through cfg.MODEL.META_ARCHITECTURE.
+    def __init__(self, cfg):
+        # `cfg` is accepted but not read anywhere in this constructor.
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
+        self.bn = torch.nn.BatchNorm2d(4)
+        self.relu = torch.nn.ReLU(inplace=True)
+        self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
+    @property
+    def device(self):
+        # The module's device is taken from the conv layer's weight tensor.
+        return self.conv.weight.device
+    def forward(self, inputs):
+        # `inputs` is iterated as a sequence of dicts, each providing an "image" entry.
+        images = [x["image"] for x in inputs]
+        # Batch the images and move the batch to the model's device before the conv.
+        images = ImageList.from_tensors(images, 1).to(self.device)
+        ret = self.conv(images.tensor)
+        ret = self.bn(ret)
+        ret = self.relu(ret)
+        ret = self.avgpool(ret)
+        # Always returns a dict with a single "loss" entry: the norm of the pooled output.
+        return {"loss": ret.norm()}
+def train_with_memory_profiler(output_dir, device="cpu"):
+    # Run a short training job (10 iterations) with the memory profiler enabled,
+    # writing all outputs under `output_dir`, and return the config that was used.
+    # `device` is assigned to cfg.MODEL.DEVICE (the tests pass "cpu" or "cuda").
+    # NOTE(review): the numeric arguments of create_local_dataset are presumably
+    # (num_images, width, height) -- confirm against the helper's signature.
+    ds_name = create_local_dataset(output_dir, 5, 10, 10)
+    runner = default_runner.Detectron2GoRunner()
+    cfg = runner.get_default_cfg()
+    cfg.MODEL.DEVICE = device
+    cfg.MODEL.META_ARCHITECTURE = "MetaArchForTestSimple"
+    cfg.SOLVER.MAX_ITER = 10
+    # The same local dataset is used for both training and testing.
+    cfg.DATASETS.TRAIN = (ds_name,)
+    cfg.DATASETS.TEST = (ds_name,)
+    cfg.OUTPUT_DIR = output_dir
+    # Profiler settings under test; the GPU test derives the expected snapshot
+    # iterations from LOG_N_STEPS and LOG_DURING_TRAIN_AT.
+    cfg.MEMORY_PROFILER.ENABLED = True
+    cfg.MEMORY_PROFILER.LOG_N_STEPS = 3
+    cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT = 5
+    # Register configs
+    runner.register(cfg)
+    # Build the model from the config and run training on it
+    model = runner.build_model(cfg)
+    runner.do_train(cfg, model, resume=True)
+    return cfg
+class TestGPUMemoryProfiler(unittest.TestCase):
+    # Exercises training with cfg.MEMORY_PROFILER.ENABLED = True on CPU and,
+    # when CUDA is available, on GPU.
+    @tempdir
+    def test_gpu_memory_profiler_no_gpu(self, tmp_dir: str):
+        # GPU memory profiler should silently pass if no CUDA is available
+        # (no assertions here: the test passes if training completes without raising)
+        train_with_memory_profiler(tmp_dir, device="cpu")
+    @tempdir
+    @unittest.skipIf(not TEST_CUDA, "no CUDA detected")
+    def test_gpu_memory_profiler_with_gpu(self, tmp_dir: str):
+        cfg = train_with_memory_profiler(tmp_dir, device="cuda")
+        n = cfg.MEMORY_PROFILER.LOG_N_STEPS
+        s = cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT
+        # Snapshots are expected under <tmp_dir>/memory_snapshot.
+        save_dir = os.path.join(tmp_dir, "memory_snapshot")
+        self.assertTrue(os.path.exists(save_dir))
+        # Two snapshot directories are checked: iteration n - 1 and iteration
+        # s + n - 1 (iterations 2 and 7 with the values set by the helper).
+        for i in [n - 1, s + n - 1]:
+            trace_dir = os.path.join(save_dir, f"iter{i}_rank0")
+            self.assertTrue(os.path.exists(trace_dir))
+            # Each snapshot directory must contain the pickle and both HTML plots.
+            self.assertTrue(os.path.exists(os.path.join(trace_dir, "snapshot.pickle")))
+            self.assertTrue(os.path.exists(os.path.join(trace_dir, "trace_plot.html")))
+            self.assertTrue(
+                os.path.exists(os.path.join(trace_dir, "segment_plot.html"))
+            )