[DCU] Fix crash test cases

Because the compiler generated incorrect code for the cast-transpose kernel, the following test cases crashed: * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X2048X12288 * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X65536X128 * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X256X65536 This commit fixes these test cases. Signed-off-by: wenjh <wenjh@sugon.com>

[DCU] Fix crash test cases
Because the compiler generated incorrect code for the cast-transpose kernel, the following test cases crashed: * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X2048X12288 * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X65536X128 * OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X256X65536 This commit fixes these test cases. Signed-off-by: wenjh <wenjh@sugon.com>
46c81675 · wenjh · 3e001bbd · 46c81675
Commit 46c81675 authored Apr 19, 2025 by wenjh
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 1 deletion

transformer_engine/common/transpose/rtc/cast_transpose.cu transformer_engine/common/transpose/rtc/cast_transpose.cu +14 -1

No files found.
--- a/transformer_engine/common/transpose/rtc/cast_transpose.cu
+++ b/transformer_engine/common/transpose/rtc/cast_transpose.cu
@@ -91,16 +91,25 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel(
      local_output_c.store_to(&output_c[row * row_length + col]);
    }
  }
+#ifndef __HIP_PLATFORM_AMD__
  // Copy from registers to shared memory to global memory
  __shared__ OVecT shared_output_t[THREADS_PER_WARP][THREADS_PER_WARP + 1];
+#else
+  constexpr size_t inner_dim = THREADS_PER_WARP + 1;
+  constexpr size_t outter_dim = THREADS_PER_WARP;
+  __shared__ OVecT shared_output_t[outter_dim * inner_dim];
+#endif
 #pragma unroll
  for (size_t j2 = 0; j2 < nvec_in; ++j2) {
 #pragma unroll
    for (size_t iter = 0; iter < num_iterations; ++iter) {
      const size_t i1 = tidy + iter * bdimy;
      const size_t j1 = tidx;
+#ifndef __HIP_PLATFORM_AMD__
      shared_output_t[j1][i1] = local_output_t[j2][iter];
+#else
+      shared_output_t[j1 * inner_dim + i1] = local_output_t[j2][iter];
+#endif
    }
    __syncthreads();
 #pragma unroll
@@ -109,7 +118,11 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel(
      const size_t j1 = tidy + iter * bdimy;
      const size_t row = tile_row + i1 * nvec_out;
      const size_t col = tile_col + j1 * nvec_in + j2;
+#ifndef __HIP_PLATFORM_AMD__
      shared_output_t[j1][i1].store_to(&output_t[col * num_rows + row]);
+#else
+      shared_output_t[j1 * inner_dim + i1].store_to(&output_t[col * num_rows + row]);
+#endif
    }
    __syncthreads();
  }