Merge branch 'develop_v2.10' into release_v2.10

e4f5325e · wenjh · eebc98fc · a68e5f87 · e4f5325e · e4f5325e
Commit e4f5325e authored Feb 24, 2026 by wenjh
7 changed files
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -36,10 +36,12 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_nvfp4.xml $TE_PA
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py || test_fail "test_float8tensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-NVTE_INT8_SIM_FP8=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact_int8.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py_int8"
+python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
 # channelwise int8 test
-NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml  $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py
+python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml  $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py"
-NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_int8.xml  $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py_int8"
+NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_int8_tensorwise.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py_int8_tensorwise"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"

--- a/tests/pytorch/distributed/test_numerics.py
+++ b/tests/pytorch/distributed/test_numerics.py
@@ -51,11 +51,26 @@ def _run_test(quantization):
 all_boolean = [True, False]
 @pytest.mark.parametrize(
    "quantization", [None, "fp8", "mxfp8", "fp8_cs", "fp8_block_scaling", "nvfp4"]
 )
 def test_distributed(quantization):
+    if quantization == "fp8" and not fp8_available:
+        pytest.skip(reason_for_no_fp8)
+    if quantization == "fp8_cs" and not fp8_available:
+        pytest.skip(reason_for_no_fp8)
+    if quantization == "mxfp8" and not mxfp8_available:
+        pytest.skip(reason_for_no_mxfp8)
+    if quantization == "fp8_block_scaling" and not fp8_block_scaling_available:
+        pytest.skip(reason_for_no_fp8_block_scaling)
+    if quantization == "nvfp4" and not nvfp4_available:
+        pytest.skip(reason_for_no_nvfp4)
+    _run_test(quantization)
+@pytest.mark.parametrize(
+    "quantization", [None, "fp8", "mxfp8", "fp8_cs", "fp8_block_scaling", "nvfp4"]
+)
+def test_int8_distributed(quantization):
    if quantization == "fp8" and not fp8_available:
        pytest.skip(reason_for_no_fp8)
    if quantization == "fp8_cs" and not fp8_available:

--- a/tests/pytorch/test_float8_blockwise_gemm_exact.py
+++ b/tests/pytorch/test_float8_blockwise_gemm_exact.py
@@ -47,7 +47,7 @@ def cublas_gemm_fp8_blockwise_case(
    atol: float = 0.0,
    rtol: float = 0.0
 ):
-    if IS_HIP_EXTENSION and int8_simulation_fp8:
+    if IS_HIP_EXTENSION:
        if use_bias or use_gelu:
            pytest.skip("Bias and GELU not supported in int8 simulation mode on ROCm.")
        if not ((not x_columnwise and not w_columnwise and is_x_1d_scaled and not is_w_1d_scaled) or (not x_columnwise and w_columnwise and is_x_1d_scaled and not is_w_1d_scaled) or (x_columnwise and w_columnwise and is_x_1d_scaled and is_w_1d_scaled)):
@@ -168,7 +168,7 @@ def cublas_gemm_fp8_blockwise_case(
    bias_dtype = TE_DType[torch.bfloat16 if bias is None else bias.dtype]
-    if IS_HIP_EXTENSION and int8_simulation_fp8:  
+    if IS_HIP_EXTENSION and int8_simulation_fp8:
        if(not x_columnwise and not w_columnwise and is_x_1d_scaled and not is_w_1d_scaled):
            y = w8a8_int8_general_gemm(qw, qx, out_dtype, False, "TN", None)
        elif (not x_columnwise and w_columnwise and is_x_1d_scaled and not is_w_1d_scaled):
@@ -249,7 +249,7 @@ def cublas_gemm_test_constraint_enforced(
    expected_err_cls=RuntimeError
 ):
    if IS_HIP_EXTENSION:
-        pytest.skip("ROCm does not support cuBLAS GEMM. No need to test constraint enforcement.")
+        pytest.skip("ROCm does not support cuBLAS blockwise FP8 gemm. No need to test constraint enforcement.")
    if not fp8_blockwise_gemm_supported():
        pytest.skip("CUDA version does not support blockwise FP8 gemm.")
    # Setup device and random seed

--- a/tests/pytorch/test_float8_blockwise_scaling_exact.py
+++ b/tests/pytorch/test_float8_blockwise_scaling_exact.py
@@ -9,7 +9,7 @@ import pathlib
 import pytest
 import torch
 import transformer_engine.pytorch as te
-from transformer_engine.pytorch.fp8 import blockwise_fp8_block_len
+from transformer_engine.pytorch.fp8 import (FP8GlobalStateManager, blockwise_fp8_block_len)
 from transformer_engine.common.recipe import Float8BlockScaling
 from transformer_engine.pytorch.constants import TE_DType
 from transformer_engine.pytorch import (
@@ -507,6 +507,9 @@ def test_quantization_block_tiling_extrema_versus_reference(
        rtol=0.0,
    )
+def fp8_blockwise_scaling_supported() -> bool:
+    supported, _ = FP8GlobalStateManager.is_fp8_block_scaling_available()
+    return supported
 # FP8 per tesnor current scaling
 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
@@ -541,12 +544,65 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
        out_size,
        dtype,
        use_bias=True,
+    ):
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
+        fp8_zero_tolerance_tensor_dumps_recipe2 = None
+        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
+        # if we cannot get all four tensors, then still set the tensor dump to None
+        tensor_map = self._check_golden_tensor_dumps(
+            TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
+        )
+        if tensor_map is not None:
+            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map
+        self.compare_recipe(
+            recipe1,
+            recipe2,
+            batch_size,
+            hidden_size,
+            out_size,
+            use_bias,
+            seed=torch.initial_seed(),
+            dtype=dtype,
+            y_error=0.5,
+            dgrad_error=1,
+            wgrad_error=1,
+            bgrad_error=0.5,
+            recipe1_golden_tensors=None,
+            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
+        )
+    @pytest.mark.parametrize(
+        "batch_size, hidden_size, out_size",
+        [
+            (16, 256, 128),
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+    @pytest.mark.parametrize(
+        "recipe1, recipe2",
+        [
+            (GetRecipes.none, GetRecipes.fp8_blockwise),
+        ],
+    )
+    def test_int8_current_scaling_with_linear_module(
+        self,
+        recipe1,
+        recipe2,
+        batch_size,
+        hidden_size,
+        out_size,
+        dtype,
+        use_bias=True,
    ):
        if IS_HIP_EXTENSION:
            import importlib
            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
            os.environ["NVTE_INT8_SIM_FP8"] = "1"
            importlib.reload(te.pytorch.fp8)
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None
@@ -612,12 +668,71 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
        out_size,
        dtype,
        use_bias=True,
+    ):
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
+        fp8_zero_tolerance_tensor_dumps_recipe2 = None
+        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
+        # if we cannot get all four tensors, then still set the tensor dump to None
+        tensor_map = self._check_golden_tensor_dumps(
+            TENSOR_DUMP_DIR,
+            recipe2,
+            (batch_size, hidden_size, out_size),
+            dtype,
+            use_bias,
+            "LayerNorm",
+        )
+        if tensor_map is not None:
+            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map
+        self.compare_recipe(
+            recipe1,
+            recipe2,
+            batch_size,
+            hidden_size,
+            out_size,
+            use_bias,
+            seed=torch.initial_seed(),
+            dtype=dtype,
+            y_error=0.5,
+            ln_out_error=0.5,
+            dgrad_error=1.6,
+            wgrad_error=1,
+            bgrad_error=0.5,
+            recipe1_golden_tensors=None,
+            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
+        )
+    @pytest.mark.parametrize(
+        "batch_size, hidden_size, out_size",
+        [
+            (16, 256, 128),
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+    @pytest.mark.parametrize(
+        "recipe1, recipe2",
+        [
+            (GetRecipes.none, GetRecipes.fp8_blockwise),
+        ],
+    )
+    def test_int8_current_scaling_with_layernorm_linear_module(
+        self,
+        recipe1,
+        recipe2,
+        batch_size,
+        hidden_size,
+        out_size,
+        dtype,
+        use_bias=True,
    ):
        if IS_HIP_EXTENSION:
            import importlib
            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
            os.environ["NVTE_INT8_SIM_FP8"] = "1"
            importlib.reload(te.pytorch.fp8)
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None

--- a/tests/pytorch/test_int8_channelwise_gemm_exact.py
+++ b/tests/pytorch/test_int8_channelwise_gemm_exact.py
--- a/transformer_engine/pytorch/quantization.py
+++ b/transformer_engine/pytorch/quantization.py
@@ -15,6 +15,7 @@ from collections import deque
 from typing import Callable, List, Optional, Dict, Any, Tuple, Union
 import torch
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 import transformer_engine_torch as tex
 from transformer_engine.common.recipe import (
    Recipe,
@@ -27,10 +28,8 @@ from transformer_engine.common.recipe import (
    CustomRecipe,
 )
 from .constants import dist_group_type
+from .utils import (get_device_compute_capability, is_gfx928, is_gfx936, is_gfx938)
-from .utils import get_device_compute_capability
 from .jit import jit_fuser
-from torch.utils.cpp_extension import IS_HIP_EXTENSION
 int8_simulation_fp8 = bool(int(os.getenv("NVTE_INT8_SIM_FP8", "0")))
 int8_simulation_fp8_tensorwise = bool(int(os.getenv("NVTE_INT8_SIM_FP8_TENSORWISE", "0")))
 blockwise_fp8_block_len = int(os.getenv("NVTE_BLOCKWISE_FP8_BLOCK_LEN", "128"))
@@ -45,32 +44,30 @@ __all__ = [
    "get_default_recipe",
 ]
-if IS_HIP_EXTENSION:
-    from transformer_engine.pytorch.utils import is_K100_AI, is_BW
 @functools.lru_cache(maxsize=None)
 def check_fp8_support() -> Tuple[bool, str]:
    """Return if fp8 support is available"""
    if IS_HIP_EXTENSION:
-        if (is_K100_AI() or is_BW()) and  int8_simulation_fp8:
+        if is_gfx938():
-            return True, "DCU turn on fp8 simulation with int8"
+            return True, ""
-        else:
+        if (is_gfx928() or is_gfx936()) and int8_simulation_fp8 and int8_simulation_fp8_tensorwise:
-            return False, "DCU not support fp8 for now"
-    else:
-        if get_device_compute_capability() >= (9, 0):  # hopper and above
            return True, ""
-        if get_device_compute_capability() < (8, 9):  # pre-ada
+    if get_device_compute_capability() >= (9, 0):  # hopper and above
-            return False, "Device compute capability 8.9 or higher required for FP8 execution."
+        return True, ""
-        if tex.get_cublasLt_version() < 120103:
+    if get_device_compute_capability() < (8, 9):  # pre-ada
-            return False, "CublasLt version 12.1.3.x or higher required for FP8 execution on Ada."
+        return False, "Device compute capability 8.9 or higher required for FP8 execution."
-        if float(torch.version.cuda) < 12.1:
+    if tex.get_cublasLt_version() < 120103:
-            return False, "Cuda version 12.1 or higher required for FP8 execution on Ada."
+        return False, "CublasLt version 12.1.3.x or higher required for FP8 execution on Ada."
+    if float(torch.version.cuda) < 12.1:
+        return False, "Cuda version 12.1 or higher required for FP8 execution on Ada."
    return True, ""
 @functools.lru_cache(maxsize=None)
 def check_mxfp8_support() -> Tuple[bool, str]:
    """Return if fp8 support is available"""
+    if IS_HIP_EXTENSION:
+        return False, "DCU not support mxfp8 for now"
    if get_device_compute_capability() >= (12, 0):
        return False, "MXFP8 (for all gemm layouts) is not supported on 12.0+ architectures yet."
    if get_device_compute_capability() >= (10, 0):  # blackwell and above
@@ -83,9 +80,8 @@ def check_nvfp4_support() -> Tuple[bool, str]:
    """Return if nvfp4 support is available"""
    if IS_HIP_EXTENSION:
        return False, "NVFP4 is not supported on rocm platform."
-    else:
+    if get_device_compute_capability() >= (10, 0):  # blackwell and above
-        if get_device_compute_capability() >= (10, 0):  # blackwell and above
+        return True, ""
-            return True, ""
    return False, "Device compute capability 10.0 or higher required for NVFP4 execution."
@@ -93,10 +89,11 @@ def check_nvfp4_support() -> Tuple[bool, str]:
 def check_fp8_block_scaling_support() -> Tuple[bool, str]:
    """Return if fp8 block scaling support is available"""
    if IS_HIP_EXTENSION:
-        if is_K100_AI() or is_BW() and int8_simulation_fp8:
+        if is_gfx938():
            return True, ""
-        else:
+        if (is_gfx928() or is_gfx936()) and int8_simulation_fp8:
-            return False, "DCU not support block_scaling fp8 for now"
+            return True, ""
+        return False, "DCU not support block_scaling fp8 for now"
    if get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.9:
        return True, ""
    return (

--- a/transformer_engine/pytorch/utils.py
+++ b/transformer_engine/pytorch/utils.py
@@ -11,11 +11,10 @@ from typing import Any, Callable, List, Optional, Sequence, Tuple, Union
 from contextlib import nullcontext
 import numpy as np
 import torch
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
-from .quantized_tensor import Quantizer
 from .torch_version import torch_version
+from .quantized_tensor import Quantizer
 from ..debug.pytorch.debug_quantization import DebugQuantizedTensor
-from torch.utils.cpp_extension import IS_HIP_EXTENSION
 __all__ = ["get_device_compute_capability", "get_cudnn_version", "is_bf16_available"]
@@ -447,20 +446,64 @@ def assert_dim_for_fp8_exec(*tensors: List[torch.Tensor]) -> None:
        )
 if IS_HIP_EXTENSION:
-    def is_mi200():
+    @functools.lru_cache(maxsize=None)
-      """check whether this machine is mi200/210/250"""
+    def _get_gcn_arch_impl(device: torch.device) -> int:
-      import re
+        props = torch.cuda.get_device_properties(device)
-      return (re.search('AMD Instinct MI2.0', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
+        import re
+        if re.search('gfx906', props.gcnArchName) is not None:
-    def is_K100_AI():
+            return 906
-      """check whether this machine is K100_AI"""
+        if re.search('gfx926', props.gcnArchName) is not None:
-      import re
+            return 926
-      return (re.search('K100_AI', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
+        if re.search('gfx928', props.gcnArchName) is not None:
+            return 928
-    def is_BW():
+        if re.search('gfx936', props.gcnArchName) is not None:
-      """check whether this machine is BW"""
+            return 936
-      import re
+        if re.search('gfx938', props.gcnArchName) is not None:
-      return (re.search('BW', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
+            return 938
+        raise RuntimeError(f"Unsupported GCN Arch {props.gcnArchName}")
+    def _get_gcn_arch() -> int:
+        return _get_gcn_arch_impl(torch.cuda.current_device())
+    def is_gfx906() -> bool:
+        """check whether this machine is gfx906"""
+        return _get_gcn_arch() == 906
+    def is_gfx926() -> bool:
+        """check whether this machine is gfx926"""
+        return _get_gcn_arch() == 926
+    def is_gfx928() -> bool:
+        """check whether this machine is gfx928"""
+        return _get_gcn_arch() == 928
+    def is_gfx936() -> bool:
+        """check whether this machine is gfx936"""
+        return _get_gcn_arch() == 936
+    def is_gfx938() -> bool:
+        """check whether this machine is gfx938"""
+        return _get_gcn_arch() == 938
+else:
+    def is_gfx906() -> bool:
+        """gfx906 is only available on ROCm"""
+        return False
+    def is_gfx926() -> bool:
+        """gfx926 is only available on ROCm"""
+        return False
+    def is_gfx928() -> bool:
+        """gfx928 is only available on ROCm"""
+        return False
+    def is_gfx936() -> bool:
+        """gfx936 is only available on ROCm"""
+        return False
+    def is_gfx938() -> bool:
+        """gfx938 is only available on ROCm"""
+        return False
 def assert_dim_for_all_gather(
    tensor: torch.Tensor, with_all_gather: bool, quantizer: Quantizer
@@ -477,13 +520,9 @@ def is_bf16_compatible() -> bool:
    check on device compute capability to enforce sm_80 or higher.
    """
    if IS_HIP_EXTENSION:
-        # only MI200 and MI300 machines support bf16
+        # only the gfx928, gfx936 and gfx938 architectures support bf16
-        if get_device_compute_capability() >= (9, 4) or is_mi200() or is_K100_AI() or is_BW():
+        return is_gfx928() or is_gfx936() or is_gfx938()
-            return True
+    return torch.cuda.get_device_capability()[0] >= 8
-        else:
-            return False
-    else:
-        return torch.cuda.get_device_capability()[0] >= 8
 def is_bf16_available(return_reason: bool = False) -> Union[bool, Tuple[bool, str]]:
@@ -517,8 +556,7 @@ def is_non_tn_fp8_gemm_supported(is_blockwise: Optional[bool] = False) -> bool:
    if IS_HIP_EXTENSION:
        if is_blockwise:
            return False
-        else:
+        return True
-            return True
    device_capability = torch.cuda.get_device_capability()
    return (10, 0) <= device_capability < (12, 0) or device_capability >= (13, 0)