Fix swizzle, swap_first_dims and RMSNorm issues

Signed-off-by: wenjh <wenjh@sugon.com>

Fix swizzle, swap_first_dims and RMSNorm issues
Signed-off-by: wenjh <wenjh@sugon.com>
3e806729 · wenjh · 08f06b7a · 3e806729 · 3e806729 · 3e806729
Commit 3e806729 authored Jan 09, 2026 by wenjh
3 changed files
--- a/tests/cpp/operator/test_normalization.cu
+++ b/tests/cpp/operator/test_normalization.cu
@@ -43,9 +43,16 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
                 << "in fused norm backward+add";
  }
+#ifdef __HIP_PLATFORM_AMD__
+  if (use_cudnn) {
+    GTEST_SKIP() << "cuDNN normalizations not supported on pre-Hopper GPUs yet!";
+    return;
+  }
+#else
  if (getDeviceComputeCapability() < hopperComputeCapability && use_cudnn) {
    GTEST_SKIP() << "cuDNN normalizations not supported on pre-Hopper GPUs yet!";
  }
+#endif
  using WeightType = InputType;
  DType itype = TypeInfo<InputType>::dtype;

--- a/transformer_engine/common/CMakeLists.txt
+++ b/transformer_engine/common/CMakeLists.txt
@@ -536,7 +536,7 @@ else()
                              string_code_transpose_rtc_cast_transpose_cu)
  make_string_header_from_file(transpose/rtc/transpose.hip
                              string_code_transpose_rtc_transpose_cu)
-  make_string_header_from_file(transpose/rtc/swap_first_dims.cu
+  make_string_header_from_file(transpose/rtc/swap_first_dims.hip
                              string_code_transpose_rtc_swap_first_dims_cu)
 endif()

--- a/transformer_engine/common/swizzle/swizzle.cu
+++ b/transformer_engine/common/swizzle/swizzle.cu
@@ -21,6 +21,15 @@ namespace {
 constexpr int MXFP8_BLOCK_SIZE = 32;
 constexpr int NVFP4_BLOCK_SIZE = 16;
+#ifdef __HIP_PLATFORM_AMD__
+constexpr int TB_DIM = 32;
+constexpr int NEW_SF_TILE_DIM_K = 16;
+constexpr int N_SF_PER_TD_PER_TILE = 4;
+// output is in ~K-major interleaved blocks
+constexpr int NEW_SF_TILE_DIM_K_I32 = NEW_SF_TILE_DIM_K / 4;
+constexpr int NEW_SF_TILE_DIM_M_I32 = 32;
+#else
 constexpr __device__ __host__ int TB_DIM = 32;
 constexpr __device__ __host__ int NEW_SF_TILE_DIM_K = 16;
 constexpr __device__ __host__ int N_SF_PER_TD_PER_TILE = 4;
@@ -28,6 +37,7 @@ constexpr __device__ __host__ int N_SF_PER_TD_PER_TILE = 4;
 // output is in ~K-major interleaved blocks
 constexpr __device__ __host__ int NEW_SF_TILE_DIM_K_I32 = NEW_SF_TILE_DIM_K / 4;
 constexpr __device__ __host__ int NEW_SF_TILE_DIM_M_I32 = 32;
+#endif
 template <typename LType>
 __device__ inline void regs_shuffle_with_bit_shifts(LType* regs_vec) {