Unverified Commit 2645eaec authored by Paweł Gadziński, committed by GitHub

[Pytorch] NVIDIA-DL-Framework-Inspect support – part 3 – tests (#1612)



* tests drop
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* move dir
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* tests fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 1d903f5e
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
: ${TE_PATH:=/opt/transformerengine}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
: ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
# Config with the dummy feature which prevents nvinspect from being disabled.
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
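# (For reference: the dummy config added in this PR enables a no-op TestDummyFeature on
# every layer via layer_name_regex_pattern: .*, which keeps the debug hooks active for
# the whole run.)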
FAIL=0
pip install pytest==8.2.1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
# standard numerics tests with initialized debug
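# (The extra flags below are presumably there to keep the run deterministic and
# comparable: PYTORCH_JIT=0 and NVTE_TORCH_COMPILE=0 disable JIT/compile paths, and
# NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 disallows nondeterministic kernels.)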
NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
exit $FAIL
@@ -20,6 +20,7 @@ FAILED_CASES=""
: ${XML_LOG_DIR:=/logs}
mkdir -p "$XML_LOG_DIR"
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
@@ -30,6 +31,19 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_use
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn_with_cp.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py || test_fail "test_fused_attn_with_cp.py"
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
# debug tests
# Config with the dummy feature which prevents nvinspect from being disabled.
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
# standard numerics tests with initialized debug
NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
if [ "$RET" -ne 0 ]; then
echo "Error in the following test cases:$FAILED_CASES"
exit 1
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import pytest
def pytest_addoption(parser):
parser.addoption(
"--feature_dirs", nargs="+", action="store", default="", help="List of feature directories"
)
parser.addoption(
"--configs_dir",
action="store",
default="",
type=str,
help="Path to the directory with configs.",
)
@pytest.fixture
def feature_dirs(request):
return request.config.getoption("--feature_dirs")
@pytest.fixture
def configs_dir(request):
return request.config.getoption("--configs_dir")
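# Example invocation (illustrative only; paths follow the CI scripts in this PR):
#   pytest -v tests/pytorch/debug/test_api_features.py \
#       --feature_dirs=transformer_engine/debug/features \
#       --configs_dir=tests/pytorch/debug/test_configs/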
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import torch
from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer
import nvdlfw_inspect.api as debug_api
try:
import transformer_engine
import transformer_engine_torch as tex
except (ImportError, ModuleNotFoundError):
print("Could not find TransformerEngine package.")
exit(1)
def test_transformer_engine_no_config(feature_dirs):
debug_api.initialize("", feature_dirs=feature_dirs)
try:
tensor = torch.rand(24, 2046).cuda()
# fp8_gemm_enabled - True by default
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
# modify_tensor_enabled - False by default
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
)
# inspect_tensor_enabled - False by default
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.attn.qkv", tensor_name="activation", iteration=0
)
# inspect_tensor_postquantize - False by default
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
)
finally:
debug_api.end_debug()
def test_disable_fp8_gemm(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "disable_fp8_gemms.yaml", feature_dirs=feature_dirs)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
# caching
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_disable_fp8_layer(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "disable_fp8_layer.yaml", feature_dirs=feature_dirs)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="fprop", iteration=0
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", iteration=0
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_per_tensor_scaling(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "per_tensor_scaling.yaml", feature_dirs=feature_dirs)
tensor = torch.rand(24, 2046).cuda()
# check modify_tensor_enabled
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0
)
# check modify_tensor
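# (Note added for clarity: the two quantizers below act as the defaults handed to
# modify_tensor - E4M3 for the forward activation, E5M2 for the gradient - matching
# the usual FP8 training convention.)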
default_quantizer1 = Float8Quantizer(
scale=torch.tensor([1]).cuda(),
amax=torch.tensor([0]).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
)
default_quantizer2 = Float8Quantizer(
scale=torch.tensor([1]).cuda(),
amax=torch.tensor([0]).cuda(),
fp8_dtype=tex.DType.kFloat8E5M2,
)
output1 = debug_api.transformer_engine.modify_tensor(
layer_name="decoder.1.mlp.fc1",
gemm="fprop",
tensor_name="activation",
default_quantizer=default_quantizer1,
iteration=0,
tensor=tensor,
)
assert type(output1) == Float8Tensor
assert output1._fp8_dtype == tex.DType.kFloat8E4M3
output2 = debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="dgrad",
tensor=tensor,
tensor_name="gradient",
default_quantizer=default_quantizer2,
iteration=0,
)
assert type(output2) == Float8Tensor
assert output2._fp8_dtype == tex.DType.kFloat8E5M2
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1",
gemm="wgrad",
tensor_name="gradient",
iteration=0,
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc4",
gemm="fprop",
tensor_name="activation",
iteration=0,
)
finally:
debug_api.end_debug()
def test_fake_quant(configs_dir, feature_dirs):
try:
debug_api.initialize(
configs_dir + "fake_quantization_config.yaml", feature_dirs=feature_dirs
)
tensor = torch.rand(24, 2046).cuda()
# modify_tensor_enabled
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
)
# modify_tensor
debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="fprop",
tensor=tensor,
tensor_name="activation",
iteration=0,
default_quantizer=None,
)
debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="dgrad",
tensor=tensor,
tensor_name="gradient",
iteration=0,
default_quantizer=None,
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.fc2", gemm="wgrad", iteration=0
)
# caching
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.fc2", gemm="wgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_statistics_collection(configs_dir, feature_dirs):
try:
debug_api.initialize(
config_file=configs_dir + "stats_collection_test_config.yaml",
feature_dirs=feature_dirs,
default_logging_enabled=False,
)
tensor = torch.randn((100, 100, 5)).cuda()
tensor_fp8 = Float8Tensor(
data=tensor.to(torch.uint8).cuda(),
fp8_scale_inv=torch.full([1], 1.0).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
shape=tensor.shape,
dtype=torch.float32,
)
def log():
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
return STATS_BUFFERS.log_stats()
def assert_empty():
stats = log()
assert len(stats) == 0
# TE tensor stats --
debug_api.transformer_engine.inspect_tensor(
"decoder.1.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=200,
tp_group=None,
)
stats = log()
assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max()
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.mlp.fc1", tensor_name="activation", iteration=201
)
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.2.mlp.fc1", tensor_name="activation", iteration=200
)
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.mlp.fc1", tensor_name="gradient", iteration=200
)
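# (Note added for clarity: in the FP8 E4M3 encoding, byte 0 represents zero and byte 126
# is the largest finite value, so the two expressions below estimate the percentage of
# elements that underflowed to zero or saturated at the maximum.)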
expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5)
expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5)
# TE FP8 tensor stats --
assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
)
debug_api.transformer_engine.inspect_tensor_postquantize(
"decoder.1.mlp.fc1",
tensor=tensor_fp8,
tensor_name="gradient",
iteration=200,
rowwise=True,
tp_group=None,
)
stats = log()
torch.testing.assert_close(
stats[("decoder.1.mlp.fc1", "gradient", "underflows%", 200)], expected_underflows
)
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201
)
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
)
# Second config in same yaml
tensor = torch.rand((100, 100, 5))
debug_api.transformer_engine.inspect_tensor(
"decoder.6.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=200,
tp_group=None,
)
stats = log()
stats_names = [x[3] for x in stats.keys()]
assert all(s in stats_names for s in ["cur_amax", "dynamic_range", "mean", "std", "l1_norm"])
assert stats[("decoder.6.mlp.fc1", "activation", "mean", 200)] == tensor.mean()
debug_api.transformer_engine.inspect_tensor(
"decoder.7.mlp.fc1",
tensor=tensor,
tensor_name="weight",
iteration=200,
tp_group=None,
)
stats = log()
stats_names = [x[3] for x in stats.keys()]
assert all(s in stats_names for s in ["mean", "std", "l1_norm", "min", "max"])
assert stats[("decoder.7.mlp.fc1", "weight", "max", 200)] == tensor.max()
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.7.mlp.fc1", tensor_name="weight", iteration=201
)
assert_empty()
finally:
debug_api.end_debug()
def test_statistics_multi_run(configs_dir, feature_dirs):
try:
debug_api.initialize(
config_file=configs_dir + "stats_collection_test_config.yaml",
feature_dirs=feature_dirs,
default_logging_enabled=False,
)
def feed(tensor, tensor_fp8):
debug_api.transformer_engine.inspect_tensor(
"decoder.5.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=1,
tp_group=None,
)
debug_api.transformer_engine.inspect_tensor_postquantize(
"decoder.5.mlp.fc1",
tensor=tensor_fp8,
tensor_name="activation",
iteration=1,
rowwise=True,
tp_group=None,
)
def log_stats():
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
return STATS_BUFFERS.log_stats()
def fp8_tensor(t):
return Float8Tensor(
data=t.to(torch.uint8).cuda(),
fp8_scale_inv=torch.ones([1]).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
shape=t.shape,
dtype=torch.float32,
)
shape = [1024, 1024]
tensors = [torch.randn(shape) for _ in range(2)]
tensors_fp8 = [fp8_tensor(tensors[i]) for i in range(2)]
feed(tensors[0], tensors_fp8[0])
feed(tensors[1], tensors_fp8[1])
stats1 = log_stats()
tensor2 = torch.cat((tensors[0], tensors[1])).cuda()
fp8tensor2 = fp8_tensor(tensor2)
feed(tensor2, fp8tensor2)
stats2 = log_stats()
assert len(stats1.keys()) > 0
for k in stats1.keys():
torch.testing.assert_close(stats1[k], stats2[k])
finally:
debug_api.end_debug()
if __name__ == "__main__":
pass
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import pathlib
from nvdlfw_inspect.config_manager import ConfigManager
import nvdlfw_inspect.api as debug_api
try:
import transformer_engine
from transformer_engine.debug.features.api import TEConfigAPIMapper
except (ImportError, ModuleNotFoundError):
print("Could not find TransformerEngine debug module.")
exit(1)
def test_transformer_engine_config_parsing(feature_dirs):
debug_api.initialize(
config_file=pathlib.Path(__file__).resolve().parent
/ "test_configs/tensor_manipulation_transformer_engine.yaml",
feature_dirs=feature_dirs,
log_dir="./log",
)
cfg_fc1 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc1")["transformer_engine"]
cfg_fc2 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc2")["transformer_engine"]
assert cfg_fc1 and cfg_fc2
gemm_parsing = True
tensor_parsing = True
# Per tensor scaling set for dgrad, filter based on gemm
ret, _ = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="activation",
)
assert not ret
# per tensor scaling set for gradient, filter based on tensor name
ret, _ = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="activation",
)
assert not ret
ret, parsed_cfg_fc1 = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc1 == {"gemm": "dgrad", "tensor": "gradient"}
# Test tensor struct
ret, parsed_cfg_fc1_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="activation",
)
assert ret
ret, parsed_cfg_fc1_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc1_act == {
"gemm": "fprop",
"tensor": "activation",
"quant_format": "FP8E4M3",
}
assert parsed_cfg_fc1_wei == {
"gemm": "fprop",
"tensor": "weight",
"quant_format": "FP8E4M3",
}
# Test gemms struct
ret, parsed_cfg_fc2_grad = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc2_grad == {"gemm": "dgrad", "tensor": "gradient", "quant_format": "FP8E5M2"}
ret, parsed_cfg_fc2_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc2_wei == {"gemm": "dgrad", "tensor": "weight", "quant_format": "FP8E5M2"}
# Test gemm + tensor struct
ret, parsed_cfg_fc2_fprop_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="activation",
)
assert ret
assert parsed_cfg_fc2_fprop_act == {"gemm": "fprop", "tensor": "activation"}
ret, parsed_cfg_fc2_fprop_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc2_fprop_wei == {"gemm": "fprop", "tensor": "weight"}
ret, parsed_cfg_fc2_wgrad_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="activation",
)
assert ret
assert parsed_cfg_fc2_wgrad_act == {"gemm": "wgrad", "tensor": "activation"}
ret, parsed_cfg_fc2_wgrad_grad = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc2_wgrad_grad == {"gemm": "wgrad", "tensor": "gradient"}
ConfigManager.reset()
test_disable_fp8_gemm_1:
enabled: True
layers:
layer_types: [qkv, fc2]
transformer_engine:
DisableFP8GEMM:
enabled: True
gemms: [dgrad, wgrad]
test_disable_fp8_layer:
enabled: True
layers:
layer_types: [qkv]
transformer_engine:
DisableFP8Layer:
enabled: True
dummy_feature_everywhere:
enabled: True
layers:
layer_name_regex_pattern: .*
transformer_engine:
TestDummyFeature:
enabled: True
tensors: [weight, activation, gradient, output, wgrad, dgrad]
gemms: [wgrad, dgrad, fprop]
test_fake_quant_fp8:
enabled: True
layers:
layer_numbers: [1]
layer_types: [fc1, fc2]
transformer_engine:
FakeQuant:
enabled: True
gemms: [fprop, dgrad]
tensors_struct:
- tensor: activation
quant_format: FP8E4M3
- tensor: gradient
quant_format: FP8E5M2
test_per_tensor_scaling:
enabled: True
layers:
layer_numbers: [1]
layer_types: [fc1, fc2]
transformer_engine:
DisableFP8GEMM:
enabled: True
gemms: [wgrad]
PerTensorScaling:
enabled: True
gemms_struct:
- gemm: fprop
tensors_struct:
- tensor: activation
- tensor: weight
- gemm: dgrad
tensors_struct:
- tensor: gradient
stat_collection_test_1:
enabled: True
layers:
layer_numbers: [1, 3]
LogTensorStats:
enabled: True
stats: [mean, std, l1_norm, l2_norm]
tensors: [activation]
freq: 1
start_step: 100
end_step: 500
transformer_engine:
LogTensorStats:
enabled: True
stats: [cur_amax, dynamic_range]
tensors: [activation]
freq: 2
start_step: 100
end_step: 500
LogFp8TensorStats:
enabled: True
stats: [underflows%]
tensors: [gradient]
freq: 5
start_step: 100
end_step: 500
stat_collection_test_2:
enabled: True
layers:
layer_numbers: [6, 7]
transformer_engine:
LogTensorStats:
enabled: True
tensors_struct:
- tensor: activation
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
freq: 2
start_step: 100
end_step: 500
- tensor: weight
stats: [mean, std, l1_norm, min, max]
freq: 5
start_step: 100
end_step: 500
stat_collection_test_4:
enabled: True
layers:
layer_numbers: [5]
transformer_engine:
LogTensorStats:
enabled: True
tensors: [activation]
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
LogFp8TensorStats:
enabled: True
stats: [underflows%]
tensors: [activation]
# This config is used when FP8 training is ON
transformer_engine_fc1_manipulation:
enabled: True
layers:
layer_name_regex_pattern: .*(fc1) # Select layers whose names end with fc1
transformer_engine: # namespace
DisableFP8GEMM: # Disable FP8 GEMM; fprop runs in high precision
enabled: True
gemms: [fprop]
PerTensorScaling: # Scale dgrad gradients using per-tensor current scaling and run the FP8 GEMM
enabled: True
gemms: [dgrad]
tensors: [gradient]
FakeQuant: # Disable FP8 GEMM for wgrad; fake-quantize the activations used by wgrad and run a high-precision GEMM
enabled: True
gemms: [fprop]
tensors_struct:
- tensor: activation
quant_format: FP8E4M3
- tensor: weight
quant_format: FP8E4M3
transformer_engine_fc2_manipulation:
enabled: True
layers:
layer_name_regex_pattern: .*(fc2) # Select layers whose names end with fc2
transformer_engine: # namespace
PerTensorScaling: # Scale wgrad and fprop inputs using per-tensor current scaling and run the FP8 GEMM
enabled: True
gemms_struct:
- gemm: fprop
tensors_struct:
- tensor: activation
- tensor: weight
- gemm: wgrad
tensors_struct:
- tensor: activation
- tensor: gradient
FakeQuant: # Disable FP8 GEMM for dgrad; fake-quantize the weights and gradients used by dgrad and run a high-precision GEMM
enabled: True
gemms_struct:
- gemm: dgrad
tensors: [weight, gradient]
quant_format: FP8E5M2
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import subprocess
from pathlib import Path
import pytest
import torch
"""
Distributed numerics tests
These tests test the numerical corectness of the TransformerEngine layers.
Tests are parametrized by the layer and fp8 precision.
One test consists of running multiple configurations from file run_numerics.py
Such design is due to the fact the initialization of one test is long
- 2 processes need to start and load torch and TE. Multiple configurations
are run in one test - this reduces the initialization overhead.
"""
if torch.cuda.device_count() < 2:
pytest.skip("Distributed training needs at least 2 GPUs.")
TEST_ROOT = Path(__file__).parent.resolve()
NUM_PROCS: int = min(4, torch.cuda.device_count())
LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
def test_debug_distributed(feature_dirs):
test_path = TEST_ROOT / "run_distributed.py"
test_cmd = LAUNCH_CMD + [str(test_path), f"--feature_dirs={feature_dirs[0]}"]
result = subprocess.run(test_cmd, env=os.environ, capture_output=True, check=False)
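# check=False: a non-zero return code is surfaced below with the worker's stderr attached.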
if result.returncode != 0:
raise AssertionError(result.stderr.decode())
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import functools
import itertools
import os
import random
import tempfile
from string import Template
import pytest
import torch
import nvdlfw_inspect.api as debug_api
import transformer_engine.debug
import transformer_engine.pytorch as te
import transformer_engine_torch as tex
from transformer_engine.common.recipe import DelayedScaling, Format
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.fp8 import _default_sf_compute
from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer
from test_numerics import create_config_file
B, S, H, D = 64, 64, 64, 64
model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"]
configs = {
"": "",
"log": """log:
layers:
layer_types: [linear]
enabled: True
transformer_engine:
LogTensorStats:
enabled: True
tensors: [activation, gradient, weight, output, wgrad, dgrad]
stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
start_step: 0
end_step: 1
LogFp8TensorStats:
enabled: True
tensors: [activation, gradient, weight]
stats: [underflows, overflows]
start_step: 0
end_step: 1
""",
"fake_quant": """
fake_quant_config:
enabled: True
layers:
layer_types: [linear]
transformer_engine:
FakeQuant:
enabled: True
gemms: [fprop, dgrad, wgrad]
quant_format: FP8E5M2
""",
}
def _get_model(model_key):
if model_key == "linear":
return te.Linear(D, D)
if model_key == "layernorm_linear":
return te.LayerNormLinear(D, D)
if model_key == "layernorm_mlp":
return te.LayerNormMLP(D, D, D)
if model_key == "mha_attention":
return te.MultiheadAttention(D, H)
if model_key == "transformer_layer":
return te.TransformerLayer(D, D, H)
def _run_forward_backward(model, fp8):
for _ in range(3):
inp = torch.randn((S, B, H)).cuda()
with te.fp8_autocast(enabled=fp8):
out = model(inp)
out.sum().backward()
debug_api.step()
@create_config_file
def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir):
try:
if config != "":
config_file.write(config)
config_file.flush()
config_file_name = config_file.name if config != "" else ""
debug_api.initialize(feature_dirs=feature_dirs, config_file=config_file_name)
model = _get_model(model_key)
_run_forward_backward(model, fp8)
finally:
debug_api.end_debug()
@pytest.mark.parametrize("model_key", model_keys)
@pytest.mark.parametrize("fp8", [False, True])
@pytest.mark.parametrize("config_key", configs.keys())
def test_sanity_debug(model_key, fp8, config_key, feature_dirs):
_run_test(model_key, fp8, configs[config_key], feature_dirs)
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
LOG_FILE = os.path.join("nvdlfw_inspect_logs", "nvdlfw_inspect_globalrank-0.log")
def reset_debug_log():
if os.path.isfile(LOG_FILE):
# delete all content
with open(LOG_FILE, "w") as f:
pass
def check_debug_log(msg):
with open(LOG_FILE, "r") as f:
for line in f.readlines():
if msg in line:
return True
return False
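# Minimal usage sketch (hypothetical; the actual tests import these helpers):
#   reset_debug_log()
#   ... run a debug-enabled forward/backward pass ...
#   assert check_debug_log("some expected log message")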
@@ -34,6 +34,18 @@ NCCL_WORLD = None
LOSS_FN = nn.MSELoss()
QUANTIZATION = None
if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False):
# The numerics of all the layers should be the same
# when debug=True. A dummy feature is fed to the layers
# to prevent debug from being switched off, which happens
# when no feature is active.
import nvdlfw_inspect.api as debug_api
debug_api.initialize(
os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
)
# Disable TF32
torch.backends.cuda.matmul.allow_tf32 = False
......
@@ -102,6 +102,20 @@ all_normalizations = ["LayerNorm", "RMSNorm"]
mask_types = ["causal", "no_mask"]
NVTE_TEST_NVINSPECT_ENABLED = os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False)
if NVTE_TEST_NVINSPECT_ENABLED:
# The numerics of all the layers should be the same
# when debug=True. A dummy feature is fed to the layers
# to prevent debug from being switched off, which happens
# when no feature is active.
import nvdlfw_inspect.api as debug_api
debug_api.initialize(
os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
)
fp8_recipes = [
recipe.MXFP8BlockScaling(),
recipe.DelayedScaling(),
@@ -568,6 +582,8 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_m
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -682,6 +698,8 @@ def test_gpt_full_activation_recompute(
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -1726,6 +1744,8 @@ def test_grouped_linear_accuracy(
pytest.skip(reason_for_no_fp8)
if fp8 and recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -1924,6 +1944,8 @@ def test_padding_grouped_linear_accuracy(
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -2039,6 +2061,8 @@ def _test_gpt_e2e_cuda_graph(block, bs, dtype, config, graph):
@pytest.mark.parametrize("bs", batch_sizes)
@pytest.mark.parametrize("model", ["126m"])
def test_gpt_cuda_graph(dtype, bs, model):
if NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("Cuda Graphs are not supported in debug mode.")
config = model_configs[model]
sigma = 0.023
@@ -2136,6 +2160,8 @@ def test_gpt_fp8_parameters(dtype, bs, model, recipe):
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
......
@@ -12,7 +12,7 @@ from nvdlfw_inspect.registry import Registry
import torch
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
-from transformer_engine.pytorch.tensor import all_tensor_types
+from transformer_engine.pytorch.tensor import get_all_tensor_types
from transformer_engine.debug.pytorch.debug_state import TEDebugState
from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor
@@ -424,7 +424,7 @@ class TransformerEngineAPI(BaseNamespaceAPI):
if api_name in ["inspect_tensor", "inspect_tensor_postquantize"]:
assert ret is None
if api_name == "modify_tensor":
-assert type(ret) in all_tensor_types
+assert type(ret) in get_all_tensor_types()
if (
type(ret) == torch.Tensor # pylint: disable=unidiomatic-typecheck
and "dtype" in kwargs
@@ -438,4 +438,4 @@ class TransformerEngineAPI(BaseNamespaceAPI):
def end_debug(self):
"""This function is called by the nvidia-dlframework-inspect after every debug_api.end_debug()"""
-TEDebugState.reset()
+TEDebugState._reset()