Merge branch 'develop_v2.10' into release_v2.10

8f468f42 · wenjh · e8c89225 · 953b6d68 · 8f468f42 · 8f468f42
Commit 8f468f42 authored Jan 08, 2026 by wenjh
5 changed files
--- a/qa/L0_pytorch_unittest/test.sh
+++ b/qa/L0_pytorch_unittest/test.sh
@@ -23,6 +23,7 @@ set -x
 mkdir -p "$XML_LOG_DIR"
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
+pip3 install expecttest || error_exit "Failed to install expecttest"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
@@ -37,8 +38,8 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwiset
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
 NVTE_INT8_SIM_FP8=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
 # channelwise int8 test
-NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s test_float8_current_scaling_exact.py
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_channelwise.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py (channelwise int8)"  # own XML report; failure is recorded like the other tests
-NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s test_float8_current_scaling_exact.py
+NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_tensorwise.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py (tensorwise int8)"  # distinct XML so this run does not overwrite the channelwise report
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"

--- a/qa/L1_pytorch_distributed_unittest/test.sh
+++ b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -31,6 +31,7 @@ pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
+rm -rf /tmp/rdzv*
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py || test_fail "test_torch_fsdp2.py"
 python3 -m pytest -v -s --log-cli-level=INFO --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"

--- a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
+++ b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
@@ -192,6 +192,10 @@ def _test_linear(
    dtype = model_config.dtype
    quantization = model_config.quantization
    quantized_compute = quantization is not None
+    # Userbuffers does not support row parallel mode with bias
+    # Skip this unsupported combination
+    if tensor_parallel_mode == "row" and bias:
+        return
    # Distributed process group
    process_group = world_group()

--- a/tests/pytorch/test_numerics.py
+++ b/tests/pytorch/test_numerics.py
@@ -627,12 +627,8 @@ def _test_e2e_selective_recompute(
 def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_model_params):
    if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
        pytest.skip("FP8 parameters are not supported in debug mode.")
-    if recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
    if fp8 and recipe.nvfp4():
        if dtype not in get_nvfp4_inp_supported_dtypes(recipe, dtype):
@@ -749,12 +745,8 @@ def test_gpt_full_activation_recompute(
 ):
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
    if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
        pytest.skip("FP8 parameters are not supported in debug mode.")
-    if recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    if fp8 and recipe.nvfp4():
        if dtype not in get_nvfp4_inp_supported_dtypes(recipe, dtype):
@@ -1340,10 +1332,6 @@ def test_linear_accuracy_save_original_input(dtype, model, recipe):
        pytest.skip(reason_for_no_fp8)
    if fp8 and recipe.delayed():
        pytest.skip("DelayedScaling recipe is not supported with save_original_input")
-    if fp8 and recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    config = model_configs[model]
    if config.max_seqlen_q % 16 != 0 and fp8:
@@ -1866,10 +1854,6 @@ def test_grouped_linear_accuracy(
    fp8 = recipe is not None
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if fp8 and recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    if fp8 and fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
        pytest.skip("FP8 parameters are not supported in debug mode.")
@@ -2018,10 +2002,6 @@ def test_grouped_linear_accuracy_save_original_input(
        pytest.skip("DelayedScaling recipe is not supported with save_original_input")
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if fp8 and recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    config = model_configs[model]
    if config.max_seqlen_q % 16 != 0 and fp8:
@@ -2236,12 +2216,8 @@ def test_padding_grouped_linear_accuracy(
 ):
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
    if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
        pytest.skip("FP8 parameters are not supported in debug mode.")
-    if recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    config = model_configs[model]
    if config.max_seqlen_q % 16 != 0 and fp8:
@@ -2321,10 +2297,6 @@ def test_padding_grouped_linear_accuracy_save_original_input(
        pytest.skip("DelayedScaling recipe is not supported with save_original_input")
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    if recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    config = model_configs[model]
    if config.max_seqlen_q % 16 != 0 and fp8:
@@ -2544,12 +2516,8 @@ def _test_gpt_fp8_parameters(bs, dtype, config, fp8_model_params, recipe):
 def test_gpt_fp8_parameters(dtype, bs, model, recipe):
    if not fp8_available:
        pytest.skip(reason_for_no_fp8)
-    if recipe.mxfp8() and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
    if NVTE_TEST_NVINSPECT_ENABLED:
        pytest.skip("FP8 parameters are not supported in debug mode.")
-    if recipe.float8_block_scaling() and not fp8_block_scaling_available:
-        pytest.skip(reason_for_no_fp8_block_scaling)
    if recipe.nvfp4():
        if dtype not in get_nvfp4_inp_supported_dtypes(recipe, dtype):

--- a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
+++ b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
@@ -1567,11 +1567,12 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
      input.dtype, InputType,
      TRANSFORMER_ENGINE_TYPE_SWITCH_8BIT(
-          output.dtype, OutputType,
 #ifdef __HIP_PLATFORM_AMD__
+          output_dtype, OutputType,
          dim3 grid(num_blocks_x, num_blocks_y, 1);
          const bool full_tile = row_length % block_len == 0 && num_rows % block_len == 0;
 #else
+          output.dtype, OutputType,
          dim3 grid(num_blocks_x, num_blocks_y, 1);
          const bool full_tile = row_length % kTileDim == 0 && num_rows % kTileDim == 0;
 #endif