# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.


import pytest
import torch
import transformer_engine.pytorch as te
import time

import nvdlfw_inspect.api as debug_api

from transformer_engine.debug.pytorch.debug_state import TEDebugState


def _run_cpu_overhead(debug_tools_initialized, layer, configs_dir, feature_dirs):
    """Time repeated forward/backward passes of a tiny two-layer model.

    Args:
        debug_tools_initialized: If true, the debug API is initialized from
            ``perf_config.yaml`` before the model is built, ``debug_api.step()``
            is called after every timed iteration, and the debug session is
            ended on exit.
        layer: Which TE module to benchmark, ``"linear"`` or ``"transformer"``.
        configs_dir: Directory that contains ``perf_config.yaml``.
        feature_dirs: Passed through to ``debug_api.initialize``.

    Returns:
        Elapsed time in seconds (float) of the timed loop, which runs after one
        warm-up iteration and is bracketed by ``torch.cuda.synchronize()``.

    Raises:
        ValueError: If ``layer`` is not a supported layer name.
    """
    # Reset any existing debug session/state before (optionally) starting a new one.
    debug_api.end_debug()
    TEDebugState._reset()
    if debug_tools_initialized:
        # This config logs stats starting from iteration 0, every N iterations
        # for a huge N >> num_iters. So after 1 warm-up iteration, the layers
        # should work in non-debug mode.
        debug_api.initialize(
            config_file=configs_dir + "/perf_config.yaml", feature_dirs=feature_dirs
        )

    try:
        if layer == "linear":
            model = torch.nn.Sequential(
                te.Linear(1, 1, name="linear1"), te.Linear(1, 1, name="linear2")
            ).cuda()
            num_iters = 1800
        elif layer == "transformer":
            model = torch.nn.Sequential(
                te.TransformerLayer(1, 1, 1, name="transformer1"),
                te.TransformerLayer(1, 1, 1, name="transformer2"),
            ).cuda()
            num_iters = 200
        else:
            # Fail with a clear message instead of an UnboundLocalError further down.
            raise ValueError(
                f"Unsupported layer {layer!r}; expected 'linear' or 'transformer'."
            )

        num_invocations_per_iter = 10

        x = torch.randn(1, 1, 1).cuda()

        # Warm-up iteration, excluded from the measurement. Note that step() is
        # called here regardless of debug_tools_initialized, unlike in the timed loop.
        y = model(x)
        y.sum().backward()
        debug_api.step()
        torch.cuda.synchronize()

        # perf_counter() is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        time_start = time.perf_counter()
        for _ in range(num_iters):
            for _ in range(num_invocations_per_iter):
                y = model(x)
                y.sum().backward()
            if debug_tools_initialized:
                debug_api.step()
        # Wait for all queued GPU work before stopping the clock.
        torch.cuda.synchronize()
        time_end = time.perf_counter()

    finally:
        if debug_tools_initialized:
            debug_api.end_debug()

    return time_end - time_start


@pytest.mark.parametrize("layer", ["linear", "transformer"])
def test_cpu_overhead(layer, configs_dir, feature_dirs):
    """Check that initialized-but-idle debug tools add little CPU overhead.

    The layer is run many times on a very small tensor, so GPU time should be
    negligible and the measured time should be dominated by CPU time. If a
    layer does not invoke any feature in the current iteration, it switches to
    non-debug mode and should not have any non-negligible CPU overhead compared
    to the same layer without debug tools initialized.
    """
    # Measure with debug tools first, then the baseline without them.
    elapsed = {}
    for debug_enabled in (True, False):
        elapsed[debug_enabled] = _run_cpu_overhead(
            debug_enabled, layer, configs_dir, feature_dirs
        )

    debug_seconds = elapsed[True]
    baseline_seconds = elapsed[False]

    print(f"with_debug_tools: {debug_seconds} s")
    print(f"without_debug_tools: {baseline_seconds} s")

    # 25% overhead margin on top of the baseline.
    allowed_seconds = baseline_seconds * 1.25
    assert debug_seconds < allowed_seconds