Commit 46c81675 authored by wenjh
Browse files

[DCU] Fix crash test cases



Because the compiler generated incorrect code, the following test cases crashed:

* OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X2048X12288
* OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X65536X128
* OperatorTest/CTTestSuite.TestCastTranspose/bfloat16Xbfloat16X256X65536

This commit is intended to fix these test cases.
Signed-off-by: wenjh <wenjh@sugon.com>
parent 3e001bbd
...@@ -91,16 +91,25 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel( ...@@ -91,16 +91,25 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel(
local_output_c.store_to(&output_c[row * row_length + col]); local_output_c.store_to(&output_c[row * row_length + col]);
} }
} }
#ifndef __HIP_PLATFORM_AMD__
// Copy from registers to shared memory to global memory // Copy from registers to shared memory to global memory
__shared__ OVecT shared_output_t[THREADS_PER_WARP][THREADS_PER_WARP + 1]; __shared__ OVecT shared_output_t[THREADS_PER_WARP][THREADS_PER_WARP + 1];
#else
constexpr size_t inner_dim = THREADS_PER_WARP + 1;
constexpr size_t outter_dim = THREADS_PER_WARP;
__shared__ OVecT shared_output_t[outter_dim * inner_dim];
#endif
#pragma unroll #pragma unroll
for (size_t j2 = 0; j2 < nvec_in; ++j2) { for (size_t j2 = 0; j2 < nvec_in; ++j2) {
#pragma unroll #pragma unroll
for (size_t iter = 0; iter < num_iterations; ++iter) { for (size_t iter = 0; iter < num_iterations; ++iter) {
const size_t i1 = tidy + iter * bdimy; const size_t i1 = tidy + iter * bdimy;
const size_t j1 = tidx; const size_t j1 = tidx;
#ifndef __HIP_PLATFORM_AMD__
shared_output_t[j1][i1] = local_output_t[j2][iter]; shared_output_t[j1][i1] = local_output_t[j2][iter];
#else
shared_output_t[j1 * inner_dim + i1] = local_output_t[j2][iter];
#endif
} }
__syncthreads(); __syncthreads();
#pragma unroll #pragma unroll
...@@ -109,7 +118,11 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel( ...@@ -109,7 +118,11 @@ __global__ void __launch_bounds__(block_size) cast_transpose_optimized_kernel(
const size_t j1 = tidy + iter * bdimy; const size_t j1 = tidy + iter * bdimy;
const size_t row = tile_row + i1 * nvec_out; const size_t row = tile_row + i1 * nvec_out;
const size_t col = tile_col + j1 * nvec_in + j2; const size_t col = tile_col + j1 * nvec_in + j2;
#ifndef __HIP_PLATFORM_AMD__
shared_output_t[j1][i1].store_to(&output_t[col * num_rows + row]); shared_output_t[j1][i1].store_to(&output_t[col * num_rows + row]);
#else
shared_output_t[j1 * inner_dim + i1].store_to(&output_t[col * num_rows + row]);
#endif
} }
__syncthreads(); __syncthreads();
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment