#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import os
import unittest

import d2go.runner.default_runner as default_runner
import torch
from d2go.registry.builtin import META_ARCH_REGISTRY
from d2go.utils.testing.data_loader_helper import create_local_dataset
from d2go.utils.testing.helper import tempdir
from detectron2.structures import ImageList


TEST_CUDA: bool = torch.cuda.is_available()


@META_ARCH_REGISTRY.register()
class MetaArchForTestSimple(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
        self.bn = torch.nn.BatchNorm2d(4)
        self.relu = torch.nn.ReLU(inplace=True)
        self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))

    @property
    def device(self):
        return self.conv.weight.device

    def forward(self, inputs):
        images = [x["image"] for x in inputs]
        images = ImageList.from_tensors(images, 1).to(self.device)
        ret = self.conv(images.tensor)
        ret = self.bn(ret)
        ret = self.relu(ret)
        ret = self.avgpool(ret)
        return {"loss": ret.norm()}


def train_with_memory_profiler(output_dir, device="cpu"):
    ds_name = create_local_dataset(output_dir, 5, 10, 10)

    runner = default_runner.Detectron2GoRunner()
    cfg = runner.get_default_cfg()

    cfg.MODEL.DEVICE = device
    cfg.MODEL.META_ARCHITECTURE = "MetaArchForTestSimple"
    cfg.SOLVER.MAX_ITER = 10
    cfg.DATASETS.TRAIN = (ds_name,)
    cfg.DATASETS.TEST = (ds_name,)
    cfg.OUTPUT_DIR = output_dir
    cfg.MEMORY_PROFILER.ENABLED = True
    cfg.MEMORY_PROFILER.LOG_N_STEPS = 3
    cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT = 5

    # Register configs
    runner.register(cfg)

    # Create dummy data to pass to wrapper
    model = runner.build_model(cfg)
    runner.do_train(cfg, model, resume=True)
    return cfg


class TestGPUMemoryProfiler(unittest.TestCase):
    @tempdir
    def test_gpu_memory_profiler_no_gpu(self, tmp_dir: str):
        # GPU memory profiler should silently pass if no CUDA is available
        train_with_memory_profiler(tmp_dir, device="cpu")

    @tempdir
    @unittest.skipIf(not TEST_CUDA, "no CUDA detected")
    def test_gpu_memory_profiler_with_gpu(self, tmp_dir: str):
        cfg = train_with_memory_profiler(tmp_dir, device="cuda")
        n = cfg.MEMORY_PROFILER.LOG_N_STEPS
        s = cfg.MEMORY_PROFILER.LOG_DURING_TRAIN_AT

        save_dir = os.path.join(tmp_dir, "memory_snapshot")
        self.assertTrue(os.path.exists(save_dir))
        for i in [n - 1, s + n - 1]:
            trace_dir = os.path.join(save_dir, f"iter{i}_rank0")
            self.assertTrue(os.path.exists(trace_dir))
            self.assertTrue(os.path.exists(os.path.join(trace_dir, "snapshot.pickle")))
            self.assertTrue(os.path.exists(os.path.join(trace_dir, "trace_plot.html")))
            self.assertTrue(
                os.path.exists(os.path.join(trace_dir, "segment_plot.html"))
            )