"...composable_kernel.git" did not exist on "c0f698d5c09df5cfd97ff0520b86a8b5f6eccc54"
Commit df4cc03f authored by Bartlomiej's avatar Bartlomiej Committed by Bartlomiej Kocot
Browse files

Add s_nops after v_dot to avoid hazard

parent e7dca79d
...@@ -70,10 +70,9 @@ __global__ void ...@@ -70,10 +70,9 @@ __global__ void
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2CTileMap block_2_ctile_map) const Block2CTileMap block_2_ctile_map)
{ {
// TODO: Enable for gfx90a after complier fix #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1101__) || defined(__gfx1102__))
defined(__gfx1102__))
const index_t num_blocks_per_batch = const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
...@@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout, ...@@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD<ALayout,
static bool IsSupportedArgument(const Argument& arg) static bool IsSupportedArgument(const Argument& arg)
{ {
// TODO: Enable for gfx90a after complier fix // TODO: Enable for gfx90a after complier fix
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" || if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx90a" ||
ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx1030" ||
ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx1100" ||
ck::get_device_name() == "gfx1102") ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102")
{ {
bool pass = true; bool pass = true;
pass = pass && arg.K_ % K1 == 0; pass = pass && arg.K_ % K1 == 0;
......
...@@ -75,16 +75,17 @@ inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, f ...@@ -75,16 +75,17 @@ inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, f
template <> template <>
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c) __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
{ {
#if defined(CK_USE_AMD_V_DOT2_F32_F16) // builtin is disabled because it does not generate s_nop
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM // and this can lead to hazards
#if defined(CK_USE_AMD_V_DOT2_F32_F16) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm volatile("\n \ asm volatile("\n \
v_dot2_f32_f16 %0, %1, %2, %0\n \ v_dot2_f32_f16 %0, %1, %2, %0\n \
s_nop 2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(a), "v"(b), "0"(c)); : "v"(a), "v"(b), "0"(c));
#else
c = __builtin_amdgcn_sdot2(a, b, c, false); c = __builtin_amdgcn_sdot2(a, b, c, false);
#endif
#else #else
const vector_type<half_t, 2> a_vector{a}; const vector_type<half_t, 2> a_vector{a};
const vector_type<half_t, 2> b_vector{b}; const vector_type<half_t, 2> b_vector{b};
...@@ -162,16 +163,16 @@ template <> ...@@ -162,16 +163,16 @@ template <>
__device__ void __device__ void
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c) inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
{ {
#if defined(CK_USE_AMD_V_DOT4_I32_I8) // builtin is disabled because it does not generate s_nop
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM // and this can lead to hazards
#if defined(CK_USE_AMD_V_DOT4_I32_I8) && CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm volatile("\n \ asm volatile("\n \
v_dot4_i32_i8 %0, %1, %2, %0\n \ v_dot4_i32_i8 %0, %1, %2, %0\n \
s_nop 2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c)); : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));
#else
c = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
#endif
#else #else
const vector_type<int8_t, 4> a_vector{a}; const vector_type<int8_t, 4> a_vector{a};
const vector_type<int8_t, 4> b_vector{b}; const vector_type<int8_t, 4> b_vector{b};
......
# TODO: Enable for gfx90a after complier fix # TODO: Enable for gfx90a after complier fix
if(DL_KERNELS) if(DL_KERNELS)
if(NOT GPU_TARGETS MATCHES "gfx90a") add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp)
add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp) target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance)
endif()
endif() endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment