Unverified Commit 1ae1d228 authored by Paweł Gadziński, committed by GitHub

[PyTorch Debug] Skip some of debug tests if FP8 is not available. (#1902)



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Update tests/pytorch/debug/test_distributed.py
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>

---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent 6f4310d7
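All of the test files touched below apply the same pattern: probe FP8 support once at import time with `FP8GlobalStateManager.is_fp8_available()`, then either skip FP8-only tests or fall back to higher precision. A minimal sketch of the skip half of the pattern follows; the test name and body are hypothetical, while the probe and `pytest.skip` calls are exactly the ones used in the diff:

```python
import pytest

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

# The probe returns a boolean plus, when FP8 is unsupported, a
# human-readable reason that can be passed straight to pytest.skip.
fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()


def test_fp8_only_feature():  # hypothetical test name
    if not fp8_available:
        pytest.skip(reason_for_no_fp8)
    # FP8-specific assertions would go here.
```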
...@@ -16,7 +16,7 @@ import transformer_engine ...@@ -16,7 +16,7 @@ import transformer_engine
import transformer_engine_torch as tex import transformer_engine_torch as tex
import nvdlfw_inspect.api as debug_api import nvdlfw_inspect.api as debug_api
from transformer_engine.debug import set_weight_tensor_tp_group_reduce from transformer_engine.debug import set_weight_tensor_tp_group_reduce
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from test_numerics import ( from test_numerics import (
_emulate_linear, _emulate_linear,
@@ -45,6 +45,8 @@ FEATURE_DIRS = None
 all_boolean = [True, False]
 TEST_NR = 0
 
+fp8_available, _ = FP8GlobalStateManager.is_fp8_available()
+
 def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None, tp_rank=None):
     if tp_size is None:
@@ -221,7 +223,7 @@ def run_debug_test(func):
     return wrapper
 
-CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed:
+CONFIG_LOG_TEST_DISTRIBUTED_FP8 = """log_distributed:
   layers:
     layer_types: [linear]
   enabled:
@@ -241,11 +243,27 @@ CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed:
       end_step: 1
 """
 
+CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8 = """log_distributed:
+  layers:
+    layer_types: [linear]
+  enabled:
+    True
+  transformer_engine:
+    LogTensorStats:
+      enabled: True
+      tensors: [activation, gradient, weight, output, wgrad, dgrad]
+      stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
+      start_step : 0
+      end_step: 1
+"""
+
 def _prepare_config_test_log_distributed(config_file):
     if WORLD_RANK != 0:
         return
-    config_file.write(CONFIG_LOG_TEST_DISTRIBUTED)
+    config_file.write(
+        CONFIG_LOG_TEST_DISTRIBUTED_FP8 if fp8_available else CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8
+    )
     config_file.flush()
@@ -361,13 +379,13 @@ def test_log_expert_parallel(**kwargs):
     )  # data parallel
     model = _init_model(weight, parallel_mode=None, name="linear1")
     model1 = _init_model(weight, parallel_mode=None, name="linear2")
-    with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+    with transformer_engine.pytorch.fp8_autocast(enabled=fp8_available, fp8_recipe=FP8_RECIPE):
         y1 = model(x)
         y2 = model1(x)
         y = y1 + y2
         y.sum().backward()
     debug_api.step()
-    with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+    with transformer_engine.pytorch.fp8_autocast(enabled=fp8_available, fp8_recipe=FP8_RECIPE):
         y = model(x)
         if WORLD_RANK != 0:
             y = y + model1(x)
@@ -620,28 +638,29 @@ if __name__ == "__main__":
         for gather_weight in [True, False]:
             test_log_distributed(parallel_mode, gather_weight)
-    for parallel_mode in ["row", "column"]:
-        test_disable_fp8_layer(parallel_mode)
-    # test_disable_fp8_gemms
-    _run_test_with_combinations(
-        test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"]
-    )
-    # test_fake_quant_fp8
-    dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None]
-    _run_test_with_combinations(
-        test_fake_quant_fp8,
-        dtype_options,
-        num_repeat=6,
-        extra_args=["column", "row"],
-        sample_size=20,
-    )
-    _run_test_with_combinations(
-        test_per_tensor_scaling,
-        all_boolean,
-        num_repeat=6,
-        extra_args=["column"],
-        sample_size=20,
-    )
+    if fp8_available:
+        for parallel_mode in ["row", "column"]:
+            test_disable_fp8_layer(parallel_mode)
+        # test_disable_fp8_gemms
+        _run_test_with_combinations(
+            test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"]
+        )
+        # test_fake_quant_fp8
+        dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None]
+        _run_test_with_combinations(
+            test_fake_quant_fp8,
+            dtype_options,
+            num_repeat=6,
+            extra_args=["column", "row"],
+            sample_size=20,
+        )
+        _run_test_with_combinations(
+            test_per_tensor_scaling,
+            all_boolean,
+            num_repeat=6,
+            extra_args=["column"],
+            sample_size=20,
+        )
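The hunk above writes whichever logging config matches the hardware: the FP8 variant when FP8 is available, a generic-stats variant otherwise. A self-contained sketch of that selection follows; the YAML placeholders and the tempfile handling are illustrative, only the availability check and the write/flush pattern come from the diff:

```python
import tempfile

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

fp8_available, _ = FP8GlobalStateManager.is_fp8_available()

# Illustrative stand-ins for CONFIG_LOG_TEST_DISTRIBUTED_FP8 / _NO_FP8.
CONFIG_FP8 = "log_distributed:\n  enabled: True\n"
CONFIG_NO_FP8 = "log_distributed:\n  enabled: True\n"

with tempfile.NamedTemporaryFile("w", suffix=".yaml") as config_file:
    # In the real test only rank 0 writes the file; the selection itself
    # is a plain conditional on the availability probe.
    config_file.write(CONFIG_FP8 if fp8_available else CONFIG_NO_FP8)
    config_file.flush()  # make contents visible before anything reads the file
```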
@@ -5,7 +5,6 @@
 import os
 import subprocess
 from pathlib import Path
-
 import pytest
 import torch
@@ -21,7 +20,6 @@ import torch
     """
     if torch.cuda.device_count() < 2:
         pytest.skip("Distributed training needs at least 2 GPUs.")
-
@@ -27,6 +27,9 @@ from transformer_engine.pytorch.module.base import (
     _2X_ACC_FPROP,
     _2X_ACC_WGRAD,
 )
+from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
+
+fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 
 all_boolean = [True, False]
 FP8_FORMAT = Format.HYBRID
@@ -246,8 +249,8 @@ def _init_model(weight):
     return model
 
-def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None):
-    with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None, fp8=True):
+    with tepytorch.fp8_autocast(enabled=fp8, fp8_recipe=FP8_RECIPE):
         y = model(x, is_first_microbatch=is_first_microbatch)
         (y.sum() * loss_scale).backward()
     debug_api.step()
@@ -297,8 +300,8 @@ def run_logging_zero_numel_tensor(feature_dirs, **kwargs):
     x, weight = _get_tensors()
     x1 = x[:0, :]
     model = _init_model(weight)
-    _ = _run_forward_backward(x1, model)
-    _ = _run_forward_backward(x, model)
+    _ = _run_forward_backward(x1, model, fp8=False)
+    _ = _run_forward_backward(x, model, fp8=False)
 
 def test_logging_zero_numel_tensor(feature_dirs):
@@ -309,6 +312,8 @@ def test_logging_zero_numel_tensor(feature_dirs):
 @pytest.mark.parametrize("dgrad_fp8", all_boolean)
 @pytest.mark.parametrize("wgrad_fp8", all_boolean)
 def test_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8)
@@ -348,6 +353,8 @@ def run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8, **kwargs):
 
 def test_disable_fp8_layer(feature_dirs):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_disable_fp8_layer(feature_dirs)
@@ -393,6 +400,8 @@ subset_combinations = random.sample(all_combinations, 20)
 def test_per_tensor_scaling(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
         pytest.skip("Skipping test because all parameters are False")
     run_per_tensor_scaling(
@@ -565,6 +574,8 @@ def run_per_tensor_scaling(
 def test_microbatching_per_tensor_scaling(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
         pytest.skip("Skipping test because all parameters are False")
@@ -654,6 +665,8 @@ subset_combinations = random.sample(all_combinations, 10)
 def test_fake_quant_fp8(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_fake_quant_fp8(
         feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
     )
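In the hunks above, `_run_forward_backward` gains an `fp8` keyword so callers such as the zero-numel logging test can force non-FP8 execution and still run on GPUs without FP8 support. A hedged sketch of the same idea in isolation; the helper, model, and recipe here are illustrative, while `fp8_autocast(enabled=...)` is the real Transformer Engine entry point:

```python
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling

FP8_RECIPE = DelayedScaling()  # illustrative; the test module defines its own recipe


def run_forward_backward(x, model, loss_scale=1.0, fp8=True):
    # Mirrors the diff: enabled=fp8 turns the FP8 path into an opt-out,
    # so one helper serves both FP8 and non-FP8 test cases.
    with te.fp8_autocast(enabled=fp8, fp8_recipe=FP8_RECIPE):
        y = model(x)
        (y.sum() * loss_scale).backward()
    return y


model = te.Linear(64, 64).cuda()
x = torch.randn(128, 64, device="cuda")
run_forward_backward(x, model, fp8=False)  # runs on any GPU, FP8 or not
```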
@@ -2,27 +2,17 @@
 #
 # See LICENSE for license information.
 
-import functools
-import itertools
-import os
-import random
-import tempfile
-from string import Template
-
 import pytest
 import torch
 
 import nvdlfw_inspect.api as debug_api
-import transformer_engine.debug
 import transformer_engine.pytorch as te
-import transformer_engine_torch as tex
-from transformer_engine.common.recipe import DelayedScaling, Format
-from transformer_engine.pytorch.constants import TE_DType
-from transformer_engine.pytorch.fp8 import _default_sf_compute
-from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer
+from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
 from test_numerics import create_config_file
 
+fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 
 B, S, H, D = 64, 64, 64, 64
 
 model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"]
@@ -104,4 +94,6 @@ def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir):
 @pytest.mark.parametrize("fp8", [False, True])
 @pytest.mark.parametrize("config_key", configs.keys())
 def test_sanity_debug(model_key, fp8, config_key, feature_dirs):
+    if fp8 and not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     _run_test(model_key, fp8, configs[config_key], feature_dirs)
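The sanity test keeps its full parameter matrix and skips only the FP8 half of it. A short sketch of that selective skip; the test name and body are placeholders:

```python
import pytest

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()


@pytest.mark.parametrize("fp8", [False, True])
def test_sanity(fp8):  # placeholder name for the parametrized pattern
    # Skip only when this parametrization actually needs FP8, so the
    # fp8=False cases still exercise the debug features on older GPUs.
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
```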