Merge branch 'develop_v2.9' into release_v2.9

33062330 · wenjh · adb1e9c5 · b3dcfc28 · 33062330 · 33062330
Commit 33062330 authored Dec 03, 2025 by wenjh
10 changed files
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -101,9 +101,9 @@ struct BitsNumber {
 template <typename T>
 struct TypeInfo {
 #if FP4_TYPE_SUPPORTED
-    using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, fp8e8m0, fp4e2m1, int8>;
+    using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp8e8m0, fp4e2m1>;
 #else
-    using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, fp8e8m0, int8>;
+    using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp8e8m0>;
 #endif
    template <typename U, DType current>

--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -155,26 +155,28 @@ set(CUTLASS_TOOLS_INCLUDE_DIR
 # Python
 find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
-# NVIDIA MathDX include directory (from Python package install location)
+if(USE_CUDA)
-if(NOT DEFINED MATHDX_INCLUDE_DIR)
+  # NVIDIA MathDX include directory (from Python package install location)
-  execute_process(
+  if(NOT DEFINED MATHDX_INCLUDE_DIR)
-    COMMAND ${Python_EXECUTABLE} -m pip show nvidia-mathdx
+    execute_process(
-    OUTPUT_VARIABLE _PIP_SHOW_MATHDX
+      COMMAND ${Python_EXECUTABLE} -m pip show nvidia-mathdx
-    ERROR_VARIABLE _PIP_SHOW_MATHDX_ERR
+      OUTPUT_VARIABLE _PIP_SHOW_MATHDX
-    RESULT_VARIABLE _PIP_SHOW_MATHDX_RES
+      ERROR_VARIABLE _PIP_SHOW_MATHDX_ERR
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
+      RESULT_VARIABLE _PIP_SHOW_MATHDX_RES
-  if(NOT _PIP_SHOW_MATHDX_RES EQUAL 0)
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
-    message(FATAL_ERROR "Failed to query 'nvidia-mathdx' with pip (using ${Python_EXECUTABLE}): ${_PIP_SHOW_MATHDX_ERR}")
+    if(NOT _PIP_SHOW_MATHDX_RES EQUAL 0)
+      message(FATAL_ERROR "Failed to query 'nvidia-mathdx' with pip (using ${Python_EXECUTABLE}): ${_PIP_SHOW_MATHDX_ERR}")
+    endif()
+    string(REGEX MATCH "Location: ([^\n\r]+)" _MATHDX_LOC_MATCH "${_PIP_SHOW_MATHDX}")
+    if(NOT _MATHDX_LOC_MATCH)
+      message(FATAL_ERROR "Could not parse installation location for 'nvidia-mathdx'. Output was:\n${_PIP_SHOW_MATHDX}")
+    endif()
+    set(MATHDX_LOCATION "${CMAKE_MATCH_1}")
+    set(MATHDX_INCLUDE_DIR "${MATHDX_LOCATION}/nvidia/mathdx/include")
  endif()
-  string(REGEX MATCH "Location: ([^\n\r]+)" _MATHDX_LOC_MATCH "${_PIP_SHOW_MATHDX}")
+  if(NOT EXISTS "${MATHDX_INCLUDE_DIR}")
-  if(NOT _MATHDX_LOC_MATCH)
+    message(FATAL_ERROR "MATHDX include directory not found at ${MATHDX_INCLUDE_DIR}. Set MATHDX_INCLUDE_DIR or ensure 'nvidia-mathdx' is installed for ${Python_EXECUTABLE}.")
-    message(FATAL_ERROR "Could not parse installation location for 'nvidia-mathdx'. Output was:\n${_PIP_SHOW_MATHDX}")
  endif()
-  set(MATHDX_LOCATION "${CMAKE_MATCH_1}")
-  set(MATHDX_INCLUDE_DIR "${MATHDX_LOCATION}/nvidia/mathdx/include")
-endif()
-if(NOT EXISTS "${MATHDX_INCLUDE_DIR}")
-  message(FATAL_ERROR "MATHDX include directory not found at ${MATHDX_INCLUDE_DIR}. Set MATHDX_INCLUDE_DIR or ensure 'nvidia-mathdx' is installed for ${Python_EXECUTABLE}.")
 endif()
 # Configure Transformer Engine library

--- a/transformer_engine/common/common.h
+++ b/transformer_engine/common/common.h
@@ -417,11 +417,13 @@ struct BitsNumber {
 template <typename T>
 struct TypeInfo {
 #if FP4_TYPE_SUPPORTED
-  using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8, fp4e2m1
+  using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8
 #if CUDA_VERSION >= 12080
                           ,
                           fp8e8m0
 #endif
+                           ,
+                           fp4e2m1
                           >;
 #else
  using types = std::tuple<byte, int16, int32, int64, fp32, fp16, bf16, fp8e4m3, fp8e5m2, int8

--- a/transformer_engine/common/gemm/cublaslt_gemm.cu
+++ b/transformer_engine/common/gemm/cublaslt_gemm.cu
@@ -1175,9 +1175,6 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
  const Tensor *inputCounter = convertNVTETensor(counter);
  Tensor *wspace = convertNVTETensor(workspace);
-  const void *alpha_ptr = GetScalarOne();
-  const void *beta_ptr = accumulate ? GetScalarOne() : GetScalarZero();
  NVTE_CHECK(is_delayed_tensor_scaling(inputA->scaling_mode) &&
                 is_delayed_tensor_scaling(inputB->scaling_mode),
             "Atomic GEMM only supports delayed scaling.");
@@ -1230,6 +1227,8 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
                 stream);
  }
 #else 
+    const void *alpha_ptr = GetScalarOne();
+    const void *beta_ptr = accumulate ? GetScalarOne() : GetScalarZero();
    cublas_gemm(inputA, inputB, outputD, biasTensor, outputGelu, (transa) ? CUBLAS_OP_T : CUBLAS_OP_N,
              (transb) ? CUBLAS_OP_T : CUBLAS_OP_N, grad, wspace->data.dptr, wspace->data.shape[0],
              alpha_ptr, beta_ptr, use_split_accumulator, math_sm_count, m_split, n_split,

--- a/transformer_engine/common/include/transformer_engine/transformer_engine.h
+++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h
@@ -14,8 +14,6 @@
 #include <cuda_runtime_api.h>
 #include <stddef.h>
-#define TE_FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -33,13 +31,9 @@ enum NVTEDType {
  kNVTEBFloat16 = 6,    /*!< 16-bit bfloat (E8M7) */
  kNVTEFloat8E4M3 = 7,  /*!< 8-bit float (E4M3) */
  kNVTEFloat8E5M2 = 8,  /*!< 8-bit float (E5M2) */
-  kNVTEFloat8E8M0 = 9,  /*!< 8-bit float (E8M0) */
+  kNVTEInt8 = 9,        /*!< 8-bit integer */
-  #if TE_FP4_TYPE_SUPPORTED
+  kNVTEFloat8E8M0 = 10,  /*!< 8-bit float (E8M0) */
-  kNVTEFloat4E2M1 = 10, /*!< 4-bit float (E2M1) */
+  kNVTEFloat4E2M1 = 11, /*!< 4-bit float (E2M1) */
-  kNVTEInt8 = 11,        /*!< 8-bit integer */
-  #else
-  kNVTEInt8 = 10,        /*!< 8-bit integer */
-  #endif 
  kNVTENumTypes         /*!< Number of supported types */
 };
@@ -423,13 +417,9 @@ enum class DType {
  kBFloat16 = 6,
  kFloat8E4M3 = 7,
  kFloat8E5M2 = 8,
-  kFloat8E8M0 = 9,
+  kInt8 = 9,
-  #if TE_FP4_TYPE_SUPPORTED
+  kFloat8E8M0 = 10,
-  kFloat4E2M1 = 10,
+  kFloat4E2M1 = 11,
-  kInt8 = 11,
-  #else
-  kInt8 = 10,
-  #endif
  kNumTypes
 };
@@ -457,11 +447,7 @@ inline bool is_fp8_dtype(const DType t) {
 *  \param[in] DType      TE Datatype of interest
 */
 inline bool is_fp4_dtype(const DType t) {
-  #if TE_FP4_TYPE_SUPPORTED
  return t == DType::kFloat4E2M1;
-  #else
-  return false;
-  #endif
 }
 /*! \brief Check if TE datatype is high precision (FP32, FP16, BF16)

--- a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu
+++ b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise_fp4.cu
@@ -5,13 +5,21 @@
 ************************************************************************/
 #include <cuda.h>
+#ifndef __HIP_PLATFORM_AMD__
 #include <cudaTypedefs.h>
+#else
+#define CUDA_VERSION 0
+#endif
 #include <cuda_bf16.h>
 #include <cuda_runtime.h>
 #include <algorithm>
 #include <cfloat>
+#ifndef __HIP_PLATFORM_AMD__
 #include <cuda/barrier>
+#endif
 #include <utility>
 #include "common/common.h"
@@ -19,7 +27,10 @@
 #include "common/transpose/cast_transpose.h"
 #include "common/util/ptx.cuh"
 #include "common/utils.cuh"
+#ifndef __HIP_PLATFORM_AMD__
 #include "curanddx.hpp"
+#endif
 namespace transformer_engine {

--- a/transformer_engine/common/util/cast_kernels.cuh
+++ b/transformer_engine/common/util/cast_kernels.cuh
@@ -576,6 +576,7 @@ __device__ __forceinline__ fp8e4m3 compute_decoding_scaling_factor(const float b
 #define DIRECT_SCALING_FACTORS_STORE 1
+#ifndef __HIP_PLATFORM_AMD__
 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
          typename IType, typename OType, bool COLWISE_SCALING, size_t CHUNK_DIM_Y,
          size_t CHUNK_DIM_X, size_t THREADS_PER_CHUNK>
@@ -1065,6 +1066,7 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
  destroy_barriers<STAGES>(mbar, is_master_thread);
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }
+#endif
 }  // namespace nvfp4_kernel
 constexpr size_t FP8_CHUNK_DIM_Y = 128;
@@ -1725,6 +1727,11 @@ void mxfp8_quantize(const Tensor &input, const Tensor *act_input,
 // 2. r16c32 - Rowwise NVFP4 AND Colwise MXFP8
 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &)>
 void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cudaStream_t stream) {
+#ifdef __HIP_PLATFORM_AMD__
+  assert(false);
+#else
  using namespace nvfp4_kernel;
  using namespace ptx;
  checkCuDriverContext(stream);
@@ -1853,6 +1860,7 @@ void nvfp4_quantize(const Tensor &input, const Tensor *noop, Tensor *output, cud
              break;
          });  // NOLINT(*)
  );           // NOLINT(*)
+#endif
 }
 namespace detail {

--- a/transformer_engine/common/util/logging.h
+++ b/transformer_engine/common/util/logging.h
@@ -23,7 +23,9 @@
 #endif // __HIP_PLATFORM_AMD__
 #include <nvrtc.h>
+#ifndef __HIP_PLATFORM_AMD__
 #include "nccl.h"
+#endif
 #ifdef NVTE_WITH_CUBLASMP
 #include <cublasmp.h>

--- a/transformer_engine/common/util/nvfp4_transpose.cuh
+++ b/transformer_engine/common/util/nvfp4_transpose.cuh
@@ -12,7 +12,13 @@
 #define TRANSFORMER_ENGINE_NVFP4_TRANSPOSE_CUH_
 #include <cuda.h>
+#ifndef __HIP_PLATFORM_AMD__
 #include <cudaTypedefs.h>
+#else
+#define CUDA_VERSION 0
+#endif
 #include <cuda_runtime.h>
 #if FP4_TYPE_SUPPORTED
@@ -22,7 +28,11 @@
 #include "../common.h"
 #include "../utils.cuh"
+#ifndef __HIP_PLATFORM_AMD__
 #include "curanddx.hpp"
+#endif
 #include "math.h"
 #include "ptx.cuh"
 #include "transformer_engine/transformer_engine.h"

--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -1486,10 +1486,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
      // We need:
      // 1. Rowwise amax = amax for input
      // 2. Columnwise amax = amax for RHT(input.t)
+#ifdef __HIP_PLATFORM_AMD__
+      NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform_amax");
+#else
      NVTE_SCOPED_GIL_RELEASE({
        nvte_hadamard_transform_amax(input.data(), out.data(), 0,
                                     this->rht_matrix_random_sign_mask_t, stream);
      });
+#endif
    } else {
      // raise error since it's not supported yet
      NVTE_CHECK(false, "Pre-RHT amax is not supported yet");
@@ -1612,11 +1616,15 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
        rht_output_t_cpp.set_rowwise_data(rht_output_t.data_ptr(), input.dtype(),
                                          std::vector<size_t>{cols, rows});
+#ifdef __HIP_PLATFORM_AMD__
+        NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform");
+#else
        NVTE_SCOPED_GIL_RELEASE({
          // Perform the RHT(input.t), and write to rht_output_cpp.columnwise.
          nvte_hadamard_transform(input.data(), rht_output_t_cpp.data(), 0,
                                  this->rht_matrix_random_sign_mask_t, stream);
        });
+#endif
        // Quantize kernel will treat everything as rowwise input/output, which is
        // intended.
@@ -1628,10 +1636,14 @@ void NVFP4Quantizer::quantize_impl(const TensorWrapper& input, TensorWrapper& ou
        NVTE_CHECK(this->rht_matrix.defined() && this->rht_matrix.numel() > 0,
                   "RHT matrix is not set");
        auto rht_matrix_nvte = makeTransformerEngineTensor(this->rht_matrix);
+#ifdef __HIP_PLATFORM_AMD__
+        NVTE_CHECK(false, "Not only supported for nvte_hadamard_transform_cast_fusion_columnwise");
+#else
        NVTE_SCOPED_GIL_RELEASE({
          nvte_hadamard_transform_cast_fusion_columnwise(
              input.data(), out_transpose.data(), rht_matrix_nvte.data(), quant_config, stream);
        });
+#endif
      }
    }
  } else {