Commit 1e55a3b1 authored by Chao Liu's avatar Chao Liu
Browse files

add v_fmac inline asm

parent 2a87a973
...@@ -8,18 +8,37 @@ namespace ck { ...@@ -8,18 +8,37 @@ namespace ck {
// outer-product: c[i,j] += inner_product(a[i], b[j]) // outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) __device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1)
{ {
#if CK_USE_AMD_V_FMAC_F32
asm volatile("\n \
v_fmac_f32 %0, %2, %3 \n \
v_fmac_f32 %1, %2, %4 \n \
"
: "=v"(c0), "=v"(c1)
: "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
#else
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %2, %3 \n \ v_mac_f32 %0, %2, %3 \n \
v_mac_f32 %1, %2, %4 \n \ v_mac_f32 %1, %2, %4 \n \
" "
: "=v"(c0), "=v"(c1) : "=v"(c0), "=v"(c1)
: "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1));
#endif
} }
// outer-product: c[i,j] += inner_product(a[i], b[j]) // outer-product: c[i,j] += inner_product(a[i], b[j])
__device__ void amd_assembly_outer_product_1x4( __device__ void amd_assembly_outer_product_1x4(
float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3) float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3)
{ {
#if CK_USE_AMD_V_FMAC_F32
asm volatile("\n \
v_fmac_f32 %0, %4, %5 \n \
v_fmac_f32 %1, %4, %6 \n \
v_fmac_f32 %2, %4, %7 \n \
v_fmac_f32 %3, %4, %8 \n \
"
: "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
: "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
#else
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \ v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \ v_mac_f32 %1, %4, %6 \n \
...@@ -28,6 +47,7 @@ __device__ void amd_assembly_outer_product_1x4( ...@@ -28,6 +47,7 @@ __device__ void amd_assembly_outer_product_1x4(
" "
: "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
: "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3));
#endif
} }
// outer-product: c[i,j] += inner_product(a[i], b[j]) // outer-product: c[i,j] += inner_product(a[i], b[j])
......
...@@ -26,6 +26,10 @@ ...@@ -26,6 +26,10 @@
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1 #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif #endif
#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif
// AMD buffer addressing // AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING #ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1 #define CK_USE_AMD_BUFFER_ADDRESSING 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment