Commit b8fe26e7 authored by yuguo's avatar yuguo
Browse files

[DCU] support blockwise fp8 quantize

parent ab3e5a92
{
"custom_map" : {
"common/utils.cuh" : "common/utils_hip.cuh",
"common/transpose/cast_transpose.h" : "common/transpose/cast_transpose_hip.h",
"common/recipe/recipe_common.cuh" : "common/recipe/recipe_common_hip.cuh",
"common/util/ptx.cuh" : "common/util/ptx_hip.cuh",
"common/util/vectorized_pointwise.h" : "common/util/vectorized_pointwise_hip.h",
"common/common.h" : "common/common_hip.h",
"/userbuffers.h" : "/userbuffers_hip.h",
......
......@@ -131,10 +131,10 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
if "pytorch" in frameworks:
install_reqs.extend(["torch>=2.1"])
install_reqs.append(
"nvdlfw-inspect @"
" git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.1#egg=nvdlfw-inspect"
)
# install_reqs.append(
# "nvdlfw-inspect @"
# " git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.1#egg=nvdlfw-inspect"
# )
# Blackwell is not supported as of Triton 3.2.0, need custom internal build
# install_reqs.append("triton")
test_reqs.extend(["numpy", "torchvision", "prettytable", "PyYAML"])
......
......@@ -2,7 +2,7 @@
#
# See LICENSE for license information.
# CXX=hipcc make build && cd build && cmake ../
# mkdir build && cd build && CXX=hipcc cmake ../
cmake_minimum_required(VERSION 3.18)
option(USE_CUDA "Use CUDA" ON)
......
......@@ -11,7 +11,7 @@ list(APPEND test_cuda_sources
test_cast_mxfp8_gated_swiglu.cu
test_qdq.cu
test_cast_mxfp8.cu
# test_cast_float8blockwise.cu
test_cast_float8blockwise.cu
test_dequantize_mxfp8.cu
test_transpose.cu
test_cast_transpose.cu
......
......@@ -163,8 +163,8 @@ else()
transpose/cast_transpose_fusion.cu
transpose/transpose_fusion.cu
transpose/multi_cast_transpose.cu
# transpose/quantize_transpose_square_blockwise.cu
# transpose/quantize_transpose_vector_blockwise.cu
transpose/quantize_transpose_square_blockwise.cu
transpose/quantize_transpose_vector_blockwise.cu
activation/gelu.cu
activation/relu.cu
activation/swiglu.cu
......
......@@ -294,7 +294,11 @@ struct TypeExtrema;
template <>
// Extrema for the fp8 e4m3 format. The representable maximum differs by
// platform: NVIDIA uses the OCP E4M3 encoding (max finite value 448), while
// AMD DCU hardware uses the E4M3 FNUZ variant whose maximum is 240 —
// presumably why the AMD build clamps scales against 240.0f (TODO confirm
// against the platform's fp8 type definition).
struct TypeExtrema<fp8e4m3> {
#ifndef __HIP_PLATFORM_AMD__
static constexpr float max = 448.0f;
#else
static constexpr float max = 240.0f;
#endif
};
template <>
......
......@@ -5,12 +5,16 @@
************************************************************************/
#include <cuda.h>
#ifndef __HIP_PLATFORM_AMD__
#include <cudaTypedefs.h>
#endif
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <cfloat>
#ifndef __HIP_PLATFORM_AMD__
#include <cuda/barrier>
#endif
#include "common/common.h"
#include "common/recipe/recipe_common.cuh"
......@@ -69,7 +73,9 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
const size_t num_rows, const size_t scale_stride_x,
const size_t scale_stride_y, const size_t scale_t_stride_x,
const size_t scale_t_stride_y, const float epsilon,
#ifndef __HIP_PLATFORM_AMD__
const __grid_constant__ CUtensorMap tensor_map_output_t,
#endif
bool pow_2_scaling) {
using IVec = Vec<IType, THREAD_TILE_DIM_X>;
using OVecCast = Vec<OType, THREAD_TILE_DIM_X>;
......@@ -128,7 +134,11 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
warp_tile_amax = warp_reduce_max<kThreadsPerWarp>(amax);
// broadcast the amax to all threads in a warp from the lane 0
constexpr int lane_zero = 0;
#ifdef __HIP_PLATFORM_AMD__
warp_tile_amax = __shfl(warp_tile_amax, lane_zero);
#else
warp_tile_amax = __shfl_sync(0xFFFFFFFF, warp_tile_amax, lane_zero);
#endif
// reduce warp_tile_amax across multiple warps in a thread block using shared mem
if (tid_in_warp == 0) {
......@@ -351,7 +361,11 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK) block_scaled_cast_transpose
warp_tile_amax = warp_reduce_max<kThreadsPerWarp>(amax);
// broadcast the amax to all threads in a warp from the lane 0
constexpr int lane_zero = 0;
#ifdef __HIP_PLATFORM_AMD__
warp_tile_amax = __shfl(warp_tile_amax, lane_zero);
#else
warp_tile_amax = __shfl_sync(0xFFFFFFFF, warp_tile_amax, lane_zero);
#endif
// reduce warp_tile_amax across multiple warps in a thread block using shared mem
if (tid_in_warp == 0) {
......@@ -447,6 +461,7 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK) block_scaled_cast_transpose
}
}
#ifndef __HIP_PLATFORM_AMD__
template <typename OutputType>
CUtensorMap get_tensor_map(const SimpleTensor& tensor, size_t global_dim_x, size_t global_dim_y) {
CUtensorMapDataType dataType;
......@@ -463,6 +478,7 @@ CUtensorMap get_tensor_map(const SimpleTensor& tensor, size_t global_dim_x, size
/*stride_elems=*/global_dim_x, /*offset_elems=*/0, sizeof(OutputType));
return tensor_map_output_trans;
}
#endif
} // namespace
} // namespace transformer_engine
......@@ -526,6 +542,7 @@ void quantize_transpose_square_blockwise(const SimpleTensor& input, SimpleTensor
row_length % BLOCK_TILE_DIM == 0 && num_rows % BLOCK_TILE_DIM == 0;
if (full_tile) {
#ifndef __HIP_PLATFORM_AMD__
CUtensorMap tensor_map_output_trans;
if (return_transpose) {
tensor_map_output_trans =
......@@ -540,6 +557,17 @@ void quantize_transpose_square_blockwise(const SimpleTensor& input, SimpleTensor
reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
scale_stride_x, scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon,
tensor_map_output_trans, pow_2_scale);
#else
block_scaled_cast_transpose_kernel<kReturnTranspose, float, InputType, OutputType>
<<<grid, THREADS_PER_BLOCK, 0, stream>>>(
reinterpret_cast<const InputType*>(input.dptr),
reinterpret_cast<OutputType*>(output.dptr),
reinterpret_cast<OutputType*>(output_t.dptr),
reinterpret_cast<float*>(scale_inv.dptr),
reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
scale_stride_x, scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon,
pow_2_scale);
#endif
} else {
block_scaled_cast_transpose_kernel_notaligned<kReturnTranspose, float, InputType,
OutputType>
......
......@@ -5,13 +5,17 @@
************************************************************************/
#include <cuda.h>
#ifndef __HIP_PLATFORM_AMD__
#include <cudaTypedefs.h>
#endif
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <cfloat>
#ifndef __HIP_PLATFORM_AMD__
#include <cuda/barrier>
#endif
#include <utility>
#include "common/common.h"
......@@ -252,12 +256,20 @@ __global__ void __launch_bounds__(kThreadsPerBlock) block_scaled_1d_cast_transpo
// Step 2.3: Reduce amax
#pragma unroll
for (int delta = kNumThreadsStore / 2; delta > 0; delta /= 2) {
#ifdef __HIP_PLATFORM_AMD__
const float other_amax = __shfl_down(amax, delta);
#else
const float other_amax = __shfl_down_sync(mask, amax, delta);
#endif
__builtin_assume(amax >= 0);
__builtin_assume(other_amax >= 0);
amax = fmaxf(amax, other_amax);
}
#ifdef __HIP_PLATFORM_AMD__
amax = __shfl(amax, src_lane);
#else
amax = __shfl_sync(mask, amax, src_lane);
#endif
CType scale;
// Step 2.4: Compute scale
scale = compute_scale_from_types<IType, OType>(amax, epsilon, pow_2_scaling);
......@@ -341,12 +353,20 @@ __global__ void __launch_bounds__(kThreadsPerBlock) block_scaled_1d_cast_transpo
// Step 3.3: Reduce amax
#pragma unroll
for (int delta = kNumThreadsStore / 2; delta > 0; delta /= 2) {
#ifdef __HIP_PLATFORM_AMD__
const float other_amax = __shfl_down(amax, delta);
#else
const float other_amax = __shfl_down_sync(mask, amax, delta);
#endif
__builtin_assume(amax >= 0);
__builtin_assume(other_amax >= 0);
amax = fmaxf(amax, other_amax);
}
#ifdef __HIP_PLATFORM_AMD__
amax = __shfl(amax, src_lane);
#else
amax = __shfl_sync(mask, amax, src_lane);
#endif
// Step 3.4: Compute scale
CType scale;
scale = compute_scale_from_types<IType, OType>(amax, epsilon, pow_2_scaling);
......@@ -472,9 +492,15 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
size_t smem_bytes = kSMemSize * sizeof(InputType);
// shared memory must be requested up
if (smem_bytes >= 48 * 1024) {
#ifdef __HIP_PLATFORM_AMD__
cudaError_t err = cudaFuncSetAttribute(
(const void *)&block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType, OutputType>,
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
#else
cudaError_t err = cudaFuncSetAttribute(
&block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType, OutputType>,
cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
#endif
NVTE_CHECK(err == cudaSuccess, "Failed to set dynamic shared memory size.");
} block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType, OutputType>
<<<grid, kThreadsPerBlock, smem_bytes, stream>>>(
......
......@@ -180,7 +180,7 @@ def initialize_ub(
which also requires `MPI_HOME=/path/to/mpi/root` to be set at compile time.
"""
if not tex.device_supports_multicast():
assert bool(int(os.getenv("UB_SKIPMC", "0"))), (
assert bool(int(os.getenv("UB_SKIPMC", "1"))), (
"CUDA device, driver and/or toolkit version does not support comm+GEMM overlap with "
+ "CUDA Multicast. Launch app with UB_SKIPMC=1 to try CUDA IPC instead."
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment