Commit 62550505 authored by wenjh

Fix some test problems in PyTorch unit tests

parent 11864d3d
@@ -40,7 +40,6 @@ NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s test_
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
 python3 $TE_PATH/tests/pytorch/test_int8_blockwise_gemm_exact.py
-NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_int8_blockwise_layers.xml $TE_PATH/tests/pytorch/test_int8_blockwise_layers.py || test_fail "test_int8_blockwise_layers.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
...
@@ -10,7 +10,9 @@
 #include <vector>
 #include <array>
 #include <random>
+#ifndef __HIP_PLATFORM_AMD__
 #include <cudaTypedefs.h>
+#endif

 #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
 #include <cuda_runtime_api.h>
...
@@ -25,6 +25,7 @@ from test_float8_current_scaling_exact import (
     TestFP8RecipeLinearBase,
     TestFP8RecipeLayerNormLinearBase,
 )
+from torch.utils.cpp_extension import IS_HIP_EXTENSION

 # read env variable NVTE_TEST_FLOAT8_BLOCK_SCALING_EXACT_TENSOR_DUMP_DIR to override the default tensor dump directory
 TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
@@ -529,6 +530,11 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
         dtype,
         use_bias=True,
     ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
@@ -554,6 +560,12 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )
+        if IS_HIP_EXTENSION:
+            if ori_int8_sim_fp8 is not None:
+                os.environ["NVTE_INT8_SIM_FP8"] = ori_int8_sim_fp8
+            else:
+                del os.environ["NVTE_INT8_SIM_FP8"]
+            importlib.reload(te.pytorch.fp8)

 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
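The added HIP branches save NVTE_INT8_SIM_FP8, force it to "1", and reload te.pytorch.fp8, presumably because that module reads the variable at import time; after compare_recipe runs, the original value is restored and the module reloaded again. A minimal sketch (my illustration, not part of this commit) of factoring this save/set/reload/restore dance into a reusable, exception-safe helper:

# Sketch only: assumes, as the diff does, that transformer_engine.pytorch.fp8
# reads NVTE_INT8_SIM_FP8 at import time, so a reload picks up the change.
import contextlib
import importlib
import os

import transformer_engine.pytorch.fp8 as te_fp8


@contextlib.contextmanager
def int8_sim_fp8_enabled():
    """Temporarily set NVTE_INT8_SIM_FP8=1 and reload te.pytorch.fp8."""
    original = os.environ.get("NVTE_INT8_SIM_FP8", None)
    os.environ["NVTE_INT8_SIM_FP8"] = "1"
    importlib.reload(te_fp8)
    try:
        yield
    finally:
        # Restore the caller's environment even if the test body raises.
        if original is not None:
            os.environ["NVTE_INT8_SIM_FP8"] = original
        else:
            del os.environ["NVTE_INT8_SIM_FP8"]
        importlib.reload(te_fp8)

Because the restore runs in a finally block, the environment is put back even if compare_recipe raises, which the inline version above does not guarantee.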
@@ -589,6 +601,11 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
         dtype,
         use_bias=True,
     ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
@@ -612,11 +629,17 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
             use_bias,
             seed=torch.initial_seed(),
             dtype=dtype,
-            y_error=0.5,
+            y_error=0.5 if not IS_HIP_EXTENSION else 0.9,
             ln_out_error=0.5,
-            dgrad_error=1.6,
+            dgrad_error=1.6 if not IS_HIP_EXTENSION else 1.0,
             wgrad_error=1,
             bgrad_error=0.5,
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )
+        if IS_HIP_EXTENSION:
+            if ori_int8_sim_fp8 is not None:
+                os.environ["NVTE_INT8_SIM_FP8"] = ori_int8_sim_fp8
+            else:
+                del os.environ["NVTE_INT8_SIM_FP8"]
+            importlib.reload(te.pytorch.fp8)
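With a helper like the int8_sim_fp8_enabled() sketch shown earlier, each of these paired set-up/tear-down branches could collapse to a single with-block (again an illustration, not the commit's code):

# Hypothetical usage of the int8_sim_fp8_enabled() sketch from above.
if IS_HIP_EXTENSION:
    with int8_sim_fp8_enabled():
        self.compare_recipe(...)  # same arguments as in the hunk above
else:
    self.compare_recipe(...)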
@@ -17,6 +17,7 @@ from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
     Float8BlockwiseQTensor,
 )
 from transformer_engine.pytorch.utils import get_device_compute_capability
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 import transformer_engine_torch as tex

 # PyTorch tensor dtypes
@@ -43,7 +44,10 @@ def _to_list(x: Union[Iterable, Any]) -> List:
 DimsType = Union[Iterable[int], int]

 # TODO replace with call to fp8.py when recipe added.
-recipe_available = get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.8
+if IS_HIP_EXTENSION:
+    recipe_available = get_device_compute_capability() >= (9, 0)
+else:
+    recipe_available = get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.8
 reason_for_no_recipe = "Quantize kernels require TMA and are only relevant with GEMMS."
...
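One pre-existing caveat in the CUDA branch kept here: float(torch.version.cuda) misorders double-digit minor versions, since float("12.10") parses as 12.1 and compares less than 12.8. A tuple-based comparison avoids this; a sketch, not part of this commit:

# Sketch of a tuple-based CUDA version check (illustrative only).
import torch

def cuda_version_at_least(major: int, minor: int) -> bool:
    """True if the bundled CUDA toolkit is at least major.minor."""
    if torch.version.cuda is None:
        # CPU-only and ROCm builds report no CUDA version.
        return False
    version = tuple(int(part) for part in torch.version.cuda.split("."))
    return version >= (major, minor)

# e.g. cuda_version_at_least(12, 8) instead of float(torch.version.cuda) >= 12.8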
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

from typing import Tuple
import math
import os
import pathlib
import pytest
import torch

import transformer_engine as te
import transformer_engine_torch as tex
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from transformer_engine.common.recipe import Float8BlockScaling
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
    Float8BlockQuantizer,
    Float8BlockwiseQTensor,
)
from references.blockwise_quantizer_reference import (
    BlockwiseQuantizerReference,
    QuantizeResult,
)
from test_float8_current_scaling_exact import (
    TestFP8RecipeLinearBase,
    TestFP8RecipeLayerNormLinearBase,
)
import logging

# read env variable NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR to override the default tensor dump directory
TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
tensor_dump_dir_env = os.getenv("NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR")
if tensor_dump_dir_env is not None:
    TENSOR_DUMP_DIR = pathlib.Path(tensor_dump_dir_env)

recipe_available, reason_for_no_recipe = FP8GlobalStateManager.is_fp8_block_scaling_available()

class GetRecipes:
    @staticmethod
    def none():
        return None

    @staticmethod
    def fp8_blockwise():
        # return default configs
        return Float8BlockScaling()


# FP8 blockwise scaling
@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        assert recipe1 == GetRecipes.none, "Only None recipe is supported for recipe1"

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )

@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase):
    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_layernorm_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR,
            recipe2,
            (batch_size, hidden_size, out_size),
            dtype,
            use_bias,
            "LayerNorm",
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.9,
            ln_out_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )
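For orientation, a minimal sketch of the pattern these compare_recipe tests exercise, using public Transformer Engine APIs (fp8_autocast and te.pytorch.Linear); the actual comparison logic lives in the imported base classes and is only summarized here:

# Illustrative sketch, not the test's implementation: run a TE Linear once
# without a recipe and once under Float8BlockScaling, then compare outputs
# within a tolerance, as compare_recipe does.
import torch
import transformer_engine.pytorch as te_pt
from transformer_engine.common.recipe import Float8BlockScaling

torch.manual_seed(1234)
linear = te_pt.Linear(256, 128, bias=False, params_dtype=torch.bfloat16)
x = torch.randn(16, 256, device="cuda", dtype=torch.bfloat16)

y_ref = linear(x)  # baseline run, no FP8 quantization

with te_pt.fp8_autocast(enabled=True, fp8_recipe=Float8BlockScaling()):
    y_fp8 = linear(x)  # blockwise-quantized run

# The tests bound the error (e.g. y_error=0.9 above) rather than demanding
# bit-exactness, unless golden tensor dumps supply exact references.
print((y_fp8 - y_ref).abs().max().item())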
@@ -2515,6 +2515,7 @@ def test_grouped_gemm(shape, dtype, layout, accumulate):
     # Force the sequential_linear and grouped_linear to use hipblaslt rather than hipblas
     if IS_HIP_EXTENSION:
+        ori_force_rocm_gemm = os.environ.get("NVTE_FORCE_ROCM_GEMM", None)
         os.environ["NVTE_FORCE_ROCM_GEMM"] = "1"

     for i in range(z):
@@ -2544,7 +2545,10 @@ def test_grouped_gemm(shape, dtype, layout, accumulate):
             single_output=single_output,
         )

     if IS_HIP_EXTENSION:
-        os.environ["NVTE_FORCE_ROCM_GEMM"] = "0"
+        if ori_force_rocm_gemm is not None:
+            os.environ["NVTE_FORCE_ROCM_GEMM"] = ori_force_rocm_gemm
+        else:
+            del os.environ["NVTE_FORCE_ROCM_GEMM"]

     # should be bit-wise match
     for o, o_ref in zip(out, out_ref):
...
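This fix mirrors the NVTE_INT8_SIM_FP8 change: instead of hard-coding "0" on teardown, the original NVTE_FORCE_ROCM_GEMM value is saved and restored (or the variable is removed if it was never set). The standard library offers the same behavior with exception safety; a sketch under my own naming, not the commit's code:

# Sketch: unittest.mock.patch.dict restores os.environ on exit, including
# removing NVTE_FORCE_ROCM_GEMM entirely if it was previously unset.
import os
from unittest import mock


def run_grouped_gemms():
    """Hypothetical stand-in for the grouped-GEMM calls in the test above."""
    assert os.environ["NVTE_FORCE_ROCM_GEMM"] == "1"


with mock.patch.dict(os.environ, {"NVTE_FORCE_ROCM_GEMM": "1"}):
    run_grouped_gemms()

Unlike the inline version, patch.dict restores the environment even if the GEMM calls raise.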