Commit 15c47cfd authored by Jing Zhang

clean asm

parent c075d3f7
@@ -661,9 +661,9 @@ int main(int argc, char* argv[])
     device_direct_convolution_2_nchw_kcyx_nkhw
 #elif 0
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_1_chwn_cyxk_khwn
-#elif 0
+#elif 1
     device_implicit_gemm_convolution_2_chwn_cyxk_khwn
 #endif
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
...
@@ -10,7 +10,11 @@
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
 
-__device__ void vmcnt(index_t cnt)
+#define data4_t vector_type<float, 4>::MemoryType
+#define data_t float
+
+template <unsigned cnt>
+inline __device__ void vmcnt()
 {
 #if !NO_VM_WAIT
     if(cnt == 0)
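The `data_t`/`data4_t` shorthands above are plain `#define`s; for reference, a scoped equivalent, assuming `vector_type<float, 4>::MemoryType` is the repo's own 4-wide float vector wrapper, would be:

```cpp
// Equivalent, scoped spelling of the two #define aliases above.
// Assumption: vector_type<float, 4>::MemoryType is the library's 4-wide
// float vector (an ext_vector_type under the hood).
using data_t  = float;
using data4_t = vector_type<float, 4>::MemoryType;
```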
@@ -31,10 +35,16 @@ __device__ void vmcnt(index_t cnt)
         s_waitcnt vmcnt(2) \n \
         " ::);
     }
+    else if(cnt == 3)
+    {
+        asm volatile("\n \
+        s_waitcnt vmcnt(3) \n \
+        " ::);
+    }
     else if(cnt == 4)
     {
         asm volatile("\n \
-        s_waitcnt vmcnt(2) \n \
+        s_waitcnt vmcnt(4) \n \
         " ::);
     }
     else
@@ -44,7 +54,7 @@ __device__ void vmcnt(index_t cnt)
 #endif
 }
 
-__device__ void lgkmcnt(index_t cnt)
+inline __device__ void lgkmcnt(index_t cnt)
 {
 #if !NO_LGKM_WAIT
     if(cnt == 0)
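`s_waitcnt` encodes its counts as immediate operands, so the wait count must be known at compile time; moving `cnt` from a runtime `index_t` parameter to a template parameter lets each instantiation collapse to a single instruction. A minimal self-contained sketch of the idea, assuming an AMD GCN target (`vmcnt_sketch` is a hypothetical name, not the repo's function):

```cpp
#include <hip/hip_runtime.h>

// Compile-time wait count: the if-chain is resolved at instantiation, so
// each vmcnt_sketch<N>() emits exactly one s_waitcnt with an immediate
// operand, which is all the ISA accepts.
template <unsigned cnt>
inline __device__ void vmcnt_sketch()
{
    static_assert(cnt <= 4, "sketch covers only the counts used here");
    if(cnt == 0)      asm volatile("s_waitcnt vmcnt(0)" ::);
    else if(cnt == 1) asm volatile("s_waitcnt vmcnt(1)" ::);
    else if(cnt == 2) asm volatile("s_waitcnt vmcnt(2)" ::);
    else if(cnt == 3) asm volatile("s_waitcnt vmcnt(3)" ::);
    else              asm volatile("s_waitcnt vmcnt(4)" ::);
}
```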
@@ -84,31 +94,11 @@ __device__ void lgkmcnt(index_t cnt)
 #endif
 }
 
-__device__ void outerProduct1x4(const float* a, const float* b, float* c)
-{
-    asm volatile("\n \
-    v_mac_f32 %0, %4, %5 \n \
-    v_mac_f32 %1, %4, %6 \n \
-    v_mac_f32 %2, %4, %7 \n \
-    v_mac_f32 %3, %4, %8 \n \
-    "
-                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
-                 : "v"(a[0]),
-                   "v"(b[0]),
-                   "v"(b[1]),
-                   "v"(b[2]),
-                   "v"(b[3]),
-                   "0"(c[0]),
-                   "1"(c[1]),
-                   "2"(c[2]),
-                   "3"(c[3]));
-}
-
-__device__ void outerProduct1x4(const float& a,
-                                const vector_type<float, 4>::MemoryType& b,
-                                vector_type<float, 4>::MemoryType& c)
+inline __device__ void outerProduct1x4(const data_t& a,
+                                       const data4_t& b,
+                                       data4_t& c)
 {
 #if 0
     asm volatile(
         "\n \
         v_mac_f32 %0, %4, %5 \n \
@@ -121,73 +111,44 @@ __device__ void outerProduct1x4(const float& a,
         "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
     );
 #else
-    outerProduct1x4(&a, (float*)&b, (float*)&c);
+    // hijack here due to a compiler issue that cannot perform proper register
+    // mapping for float4 c
+    data_t* c_p = (data_t*)&c;
+    asm volatile("\n \
+    v_mac_f32 %0, %4, %5 \n \
+    v_mac_f32 %1, %4, %6 \n \
+    v_mac_f32 %2, %4, %7 \n \
+    v_mac_f32 %3, %4, %8 \n \
+    "
+                 : "=v"(c_p[0]), "=v"(c_p[1]), "=v"(c_p[2]), "=v"(c_p[3])
+                 : "v"(a),
+                   "v"(b.x),
+                   "v"(b.y),
+                   "v"(b.z),
+                   "v"(b.w),
+                   "0"(c_p[0]),
+                   "1"(c_p[1]),
+                   "2"(c_p[2]),
+                   "3"(c_p[3]));
 #endif
 }
 
-__device__ void outerProduct4x4(const vector_type<float, 4>::MemoryType& a,
-                                const vector_type<float, 4>::MemoryType& b,
-                                vector_type<float, 4>::MemoryType& c0,
-                                vector_type<float, 4>::MemoryType& c1,
-                                vector_type<float, 4>::MemoryType& c2,
-                                vector_type<float, 4>::MemoryType& c3)
+inline __device__ void outerProduct4x4(const data4_t& a,
+                                       const data4_t& b,
+                                       data4_t& c0,
+                                       data4_t& c1,
+                                       data4_t& c2,
+                                       data4_t& c3)
 {
-#if 0
-    asm volatile(
-        "\n \
-        v_mac_f32 %0, %4, %5 \n \
-        v_mac_f32 %1, %4, %6 \n \
-        v_mac_f32 %2, %4, %7 \n \
-        v_mac_f32 %3, %4, %8 \n \
-        "
-        :
-        :"v"(c0.x),"v"(c0.y),"v"(c0.z),"v"(c0.w), \
-        "v"(a.x),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
-    );
-    asm volatile(
-        "\n \
-        v_mac_f32 %0, %4, %5 \n \
-        v_mac_f32 %1, %4, %6 \n \
-        v_mac_f32 %2, %4, %7 \n \
-        v_mac_f32 %3, %4, %8 \n \
-        "
-        :
-        :"v"(c1.x),"v"(c1.y),"v"(c1.z),"v"(c1.w), \
-        "v"(a.y),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
-    );
-    asm volatile(
-        "\n \
-        v_mac_f32 %0, %4, %5 \n \
-        v_mac_f32 %1, %4, %6 \n \
-        v_mac_f32 %2, %4, %7 \n \
-        v_mac_f32 %3, %4, %8 \n \
-        "
-        :
-        :"v"(c2.x),"v"(c2.y),"v"(c2.z),"v"(c2.w), \
-        "v"(a.z),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
-    );
-    asm volatile(
-        "\n \
-        v_mac_f32 %0, %4, %5 \n \
-        v_mac_f32 %1, %4, %6 \n \
-        v_mac_f32 %2, %4, %7 \n \
-        v_mac_f32 %3, %4, %8 \n \
-        "
-        :
-        :"v"(c3.x),"v"(c3.y),"v"(c3.z),"v"(c3.w), \
-        "v"(a.w),"v"(b.x),"v"(b.y),"v"(b.z),"v"(b.w)
-    );
-#else
     outerProduct1x4(a.x, b, c0);
     outerProduct1x4(a.y, b, c1);
     outerProduct1x4(a.z, b, c2);
     outerProduct1x4(a.w, b, c3);
-#endif
 }
 
-__device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
-                                const vector_type<float, 4>::MemoryType* b,
-                                vector_type<float, 4>::MemoryType* c)
+inline __device__ void outerProduct8x8(const data4_t* a,
+                                       const data4_t* b,
+                                       data4_t* c)
 {
     outerProduct4x4(a[0], b[0], c[0], c[2], c[4], c[6]);
     outerProduct4x4(a[0], b[1], c[1], c[3], c[5], c[7]);
@@ -195,7 +156,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
     outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
 }
 
-__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
+inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_READ
     if(offset == 0)
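The constraint pattern in the rewritten `outerProduct1x4` is the key detail: `v_mac_f32 d, s0, s1` computes `d += s0 * s1`, so each accumulator is both read and written. The `"0"`..`"3"` matching constraints tie the input accumulators to the same VGPRs as the `"=v"` outputs. A standalone sketch of the same pattern (hypothetical `mac4_sketch`, using HIP's built-in `float4` instead of the repo's `data4_t`):

```cpp
#include <hip/hip_runtime.h>

// c[0..3] += a * b.{x,y,z,w}. The matching constraints "0".."3" bind each
// input accumulator to the VGPR of the corresponding "=v" output, so the
// compiler knows v_mac_f32 updates the values in place.
inline __device__ void mac4_sketch(float a, const float4& b, float* c)
{
    asm volatile("v_mac_f32 %0, %4, %5\n\t"
                 "v_mac_f32 %1, %4, %6\n\t"
                 "v_mac_f32 %2, %4, %7\n\t"
                 "v_mac_f32 %3, %4, %8"
                 : "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
                 : "v"(a), "v"(b.x), "v"(b.y), "v"(b.z), "v"(b.w),
                   "0"(c[0]), "1"(c[1]), "2"(c[2]), "3"(c[3]));
}
```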
@@ -421,8 +382,8 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
 #endif
 }
 
-__device__ void global_load(vector_type<float, 4>::MemoryType& r,
-                            const vector_type<float, 4>::MemoryType* ptr,
+inline __device__ void global_load(data4_t& r,
+                                   const data4_t* ptr,
                                    index_t offset = 0)
 {
 #if !NO_GLB_READ
@@ -441,8 +402,8 @@ __device__ void global_load(vector_type<float, 4>::MemoryType& r,
 #endif
 }
 
-__device__ void
-ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
+inline __device__ void
+ds_write_b128(const data4_t& r, void* lds, index_t offset = 0)
 {
 #if !NO_DS_WRITE
     if(offset == 0)
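`ds_read_b128`, `ds_write_b128`, and `global_load` all branch on `offset` for the same reason `vmcnt` became a template: the `offset:` field of an LDS instruction is an immediate in the encoding, not a register operand, so each supported offset needs its own asm string. A hedged sketch of one such branch, assuming the `__to_local` cast declared above (`offset:16` is just an example value, and the hypothetical name is not the repo's function):

```cpp
// One arm of the offset dispatch: the LDS byte offset must be baked into
// the instruction, so each supported offset gets its own asm string.
// r receives 128 bits (a float4) from LDS at (lds + 16 bytes).
inline __device__ void ds_read_b128_off16_sketch(float4& r, void* lds)
{
    asm volatile("ds_read_b128 %0, %1 offset:16"
                 : "=v"(r)
                 : "v"(__to_local(lds)));
}
```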
...
@@ -7,6 +7,7 @@
 #include "threadwise_2d_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
+#include "gridwise_ops.hip.hpp"
 
 // define B = flatten(N, Hi, Wi)
 template <index_t GridSize,
@@ -208,12 +209,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
                                                     p_wei_register_clipboard);
 
-#if 1
+#if 0
         blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
         blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                      p_wei_block_double);
 #else
-        vmcnt(0);
+        global_load_waitall();
        blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                        p_in_block_double);
        blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
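`global_load_waitall()` presumably comes from the newly included `gridwise_ops.hip.hpp`, whose contents are not shown in this commit; by its name and by what it replaces (`vmcnt(0)`), it should be equivalent to waiting for every outstanding vector-memory load. A hypothetical one-liner:

```cpp
// Hypothetical stand-in for the helper from gridwise_ops.hip.hpp (not shown
// in this commit): block until all outstanding vector-memory ops retire,
// i.e. the registers filled by RunLoadRegisterClipboard are safe to read.
inline __device__ void global_load_waitall()
{
    asm volatile("s_waitcnt vmcnt(0)" ::);
}
```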
@@ -266,11 +267,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
                 for(index_t x = 0; x < X; ++x)
                 {
-#if 1
+#if 0
                     blockwise_gemm.Run
 #elif 0
                     blockwise_gemm.Run_RegisterDoubleBuffer
-#elif 0
+#elif 1
                     blockwise_gemm.Run_asm
 #endif
                         (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
@@ -279,13 +280,13 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                 }
             }
 
-#if 1
+#if 0
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                         p_in_block_next);
             blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                          p_wei_block_next);
 #else
-            vmcnt(0);
+            global_load_waitall();
            blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                            p_in_block_next);
            blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
@@ -315,11 +316,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
                 for(index_t x = 0; x < X; ++x)
                 {
-#if 1
+#if 0
                     blockwise_gemm.Run
 #elif 0
                     blockwise_gemm.Run_RegisterDoubleBuffer
-#elif 0
+#elif 1
                     blockwise_gemm.Run_asm
 #endif
                         (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
@@ -328,14 +329,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
                 }
             }
 
-#if 1
+#if 0
             blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
                                                         p_in_block_double + in_block_space);
             blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
                                                          p_wei_block_double + wei_block_space);
 #else
-            vmcnt(0);
+            global_load_waitall();
            blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
                                                            p_in_block_double + in_block_space);
            blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
@@ -349,11 +350,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
             {
                 for(index_t x = 0; x < X; ++x)
                 {
-#if 1
+#if 0
                     blockwise_gemm.Run
 #elif 0
                     blockwise_gemm.Run_RegisterDoubleBuffer
-#elif 0
+#elif 1
                     blockwise_gemm.Run_asm
 #endif
                         (p_wei_block_double + wei_block_space +
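For context on why the wait sits where it does: the kernel prefetches the next tile from global memory into registers, runs the blockwise GEMM on the LDS buffer filled last iteration, then waits for the loads before spilling the registers into the other LDS buffer. A compilable miniature of that ping-pong pattern (a hypothetical toy kernel, not the committed one):

```cpp
#include <hip/hip_runtime.h>

// Toy double-buffer pipeline: loads for tile i+1 are issued before the math
// on tile i, so memory latency overlaps compute. The multiply by 2.0f stands
// in for the blockwise GEMM; launch with <<<1, TILE>>> and
// in/out sized num_tiles * TILE.
constexpr int TILE = 256;

__global__ void double_buffer_sketch(const float* __restrict__ in,
                                     float* __restrict__ out,
                                     int num_tiles)
{
    __shared__ float lds[2][TILE];
    const int t = threadIdx.x;

    lds[0][t] = in[t]; // prefetch tile 0
    __syncthreads();

    for(int i = 0; i < num_tiles; ++i)
    {
        float reg = 0.0f;
        if(i + 1 < num_tiles)
            reg = in[(i + 1) * TILE + t]; // issue next tile's loads early

        out[i * TILE + t] = lds[i & 1][t] * 2.0f; // "GEMM" on current buffer

        if(i + 1 < num_tiles)
        {
            lds[(i + 1) & 1][t] = reg; // fill the other LDS buffer
            __syncthreads();
        }
    }
}
```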
...