Commit 76c4719e authored by Bartlomiej Kocot's avatar Bartlomiej Kocot
Browse files

Skip inline version to builtin

parent 8a9daf16
...@@ -118,8 +118,12 @@ ...@@ -118,8 +118,12 @@
// inline asm // inline asm
#define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INLINE_ASM 1
// inner product (DLOP) // inner product (V_MAC/V_FMAC)
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 #define CK_USE_AMD_V_MAC_INLINE_ASM 1
// V_DOT inline instructions, less efficient by the need to add s_nop to avoid
// hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
......
...@@ -13,13 +13,13 @@ __device__ void inner_product(const TA& a, const TB& b, TC& c); ...@@ -13,13 +13,13 @@ __device__ void inner_product(const TA& a, const TB& b, TC& c);
template <> template <>
__device__ void inner_product<float, float, float>(const float& a, const float& b, float& c) __device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
{ {
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) #if CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %1, %2 \n \ v_mac_f32 %0, %1, %2 \n \
" "
: "=v"(c) : "=v"(c)
: "v"(a), "v"(b), "0"(c)); : "v"(a), "v"(b), "0"(c));
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) #elif CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
asm volatile("\n \ asm volatile("\n \
v_fmac_f32 %0, %1, %2 \n \ v_fmac_f32 %0, %1, %2 \n \
" "
...@@ -76,7 +76,7 @@ template <> ...@@ -76,7 +76,7 @@ template <>
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c) __device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
{ {
#if defined(CK_USE_AMD_V_DOT2_F32_F16) #if defined(CK_USE_AMD_V_DOT2_F32_F16)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM #if CK_USE_AMD_V_DOT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa) // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm volatile("\n \ asm volatile("\n \
v_dot2_f32_f16 %0, %1, %2, %0\n \ v_dot2_f32_f16 %0, %1, %2, %0\n \
...@@ -165,7 +165,7 @@ __device__ void ...@@ -165,7 +165,7 @@ __device__ void
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c) inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
{ {
#if defined(CK_USE_AMD_V_DOT4_I32_I8) #if defined(CK_USE_AMD_V_DOT4_I32_I8)
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM #if CK_USE_AMD_V_DOT_INLINE_ASM
// Use 3 x s_nop to avoid hazard (mi200 cdna2 isa) // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa)
asm volatile("\n \ asm volatile("\n \
v_dot4_i32_i8 %0, %1, %2, %0\n \ v_dot4_i32_i8 %0, %1, %2, %0\n \
......
...@@ -3,13 +3,6 @@ ...@@ -3,13 +3,6 @@
## GPU visibility ## GPU visibility
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler" DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
OP=$1 OP=$1
DATATYPE=$2 DATATYPE=$2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment