"git@developer.sourcefind.cn:OpenDAS/dlib.git" did not exist on "b10b2e1e7f1223fcd1201f517d893a6c6d4e14c7"
Commit 155d7859 authored by Jing Zhang's avatar Jing Zhang
Browse files

optimize global_load

parent 05d7a087
...@@ -301,8 +301,21 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn ...@@ -301,8 +301,21 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
__syncthreads()) __syncthreads())
{ {
// load data // load data
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); //blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); //blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
Float4 tmp_in, tmp_wei;
Float4* glb_in_p = (Float4 *)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
Float4* loc_in_p = (Float4 *)(p_in_block + blockwise_in_copy.mDstMyThreadOffset);
Float4* glb_wei_p = (Float4 *)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
Float4* loc_wei_p = (Float4 *)(p_wei_block + blockwise_wei_copy.mDstMyThreadOffset);
global_load(tmp_in, glb_in_p);
global_load(tmp_wei, glb_wei_p);
vmcnt(0);
ds_write_b128(tmp_in, loc_in_p);
ds_write_b128(tmp_wei, loc_wei_p);
__syncthreads(); __syncthreads();
......
...@@ -4,6 +4,32 @@ typedef float Float4 __attribute__((ext_vector_type(4))); ...@@ -4,6 +4,32 @@ typedef float Float4 __attribute__((ext_vector_type(4)));
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
inline __device__ void vmcnt(int cnt) {
if(cnt == 0) {
asm volatile ("\n \
s_waitcnt vmcnt(0) \n \
"::);
}
else if(cnt == 1) {
asm volatile ("\n \
s_waitcnt vmcnt(1) \n \
"::);
}
else if(cnt == 2) {
asm volatile ("\n \
s_waitcnt vmcnt(2) \n \
"::);
}
else if(cnt == 4) {
asm volatile ("\n \
s_waitcnt vmcnt(2) \n \
"::);
}
else {
assert(0);
}
}
inline __device__ void lgkmcnt(int cnt) inline __device__ void lgkmcnt(int cnt)
{ {
#if 1 #if 1
...@@ -370,3 +396,23 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0) ...@@ -370,3 +396,23 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0)
assert(0); assert(0);
} }
} }
inline __device__ void global_load(Float4 &r, Float4* ptr) {
asm volatile("\n \
global_load_dwordx4 %0, %1, off \n \
"
:"=v"(r)
:"v"(ptr)
);
}
inline __device__ void ds_write_b128(Float4& r, void* lds, int offset = 0)
{
asm volatile("\n \
ds_write_b128 %0, %1 \n \
"
:
: "v"(__to_local(lds)), "v"(r)
);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment