Commit 8f0b9710 authored by Jing Zhang's avatar Jing Zhang
Browse files

global_soffset

parent 15c47cfd
...@@ -383,21 +383,25 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0) ...@@ -383,21 +383,25 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
} }
inline __device__ void global_load(data4_t& r, inline __device__ void global_load(data4_t& r,
const data4_t* ptr, const void* vptr,
index_t offset = 0) const void* sprt = 0)
{ {
#if !NO_GLB_READ #if !NO_GLB_READ
if(offset == 0) if(sprt == 0)
{ {
asm volatile("\n \ asm volatile("\n \
global_load_dwordx4 %0, %1, off \n \ global_load_dwordx4 %0, %1, off \n \
" "
: "=v"(r) : "=v"(r)
: "v"(ptr)); : "v"(vptr));
} }
else else
{ {
assert(false); asm volatile("\n \
global_load_dwordx4 %0, %1, %2 \n \
"
: "=v"(r)
: "v"(vptr), "s"(sprt));
} }
#endif #endif
} }
......
...@@ -491,7 +491,8 @@ struct Blockwise2dTensorCopy3 ...@@ -491,7 +491,8 @@ struct Blockwise2dTensorCopy3
} }
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* p_clipboard) const Float* p_clipboard,
const index_t voff = 0) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -518,9 +519,16 @@ struct Blockwise2dTensorCopy3 ...@@ -518,9 +519,16 @@ struct Blockwise2dTensorCopy3
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
auto f_copy = [&](index_t iloop) { auto f_copy = [&](index_t iloop) {
#if 1
data4_t* reg = (data4_t* )&p_clipboard[iloop * DataPerRead];
const void *vptr = (void* )(uintptr_t)((mSrcMyThreadOffset + voff) * 4);
const void *sprt = (void* )&p_src[iloop * src_loop_stride];
global_load(*reg, vptr, sprt);
#else
*(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) = *(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
*(reinterpret_cast<const vector_t*>( *(reinterpret_cast<const vector_t*>(
&p_src[mSrcMyThreadOffset + iloop * src_loop_stride])); &p_src[mSrcMyThreadOffset + iloop * src_loop_stride + voff]));
#endif
}; };
for(index_t iloop = 0; iloop < nloop_d0; ++iloop) for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
......
...@@ -193,28 +193,32 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -193,28 +193,32 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
__shared__ Float p_in_block_double[2 * in_block_space]; __shared__ Float p_in_block_double[2 * in_block_space];
__shared__ Float p_wei_block_double[2 * wei_block_space]; __shared__ Float p_wei_block_double[2 * wei_block_space];
const Float* p_in_global_block_offset = const Float* p_in_global_block_soffset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); p_in_global;
const index_t p_in_global_block_voffset = in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_wei_global_block_offset = const Float* p_wei_global_block_soffset =
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); p_wei_global;
const index_t p_wei_global_block_voffset = wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
// preload data into LDS // preload data into LDS
{ {
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard); p_in_register_clipboard,
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_in_global_block_voffset);
p_wei_register_clipboard); blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard,
p_wei_global_block_voffset);
#if 0 #if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double); p_wei_block_double);
#else #else
global_load_waitall(); global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double); p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
...@@ -250,16 +254,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -250,16 +254,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_in_global_block_soffset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); p_wei_global_block_soffset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads(); __syncthreads();
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard); p_in_register_clipboard,
p_in_global_block_voffset);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard); p_wei_register_clipboard,
p_wei_global_block_voffset);
// compute on current data // compute on current data
// a series of GEMM // a series of GEMM
...@@ -286,7 +292,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -286,7 +292,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next); p_wei_block_next);
#else #else
global_load_waitall(); global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_next); p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
...@@ -298,19 +304,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -298,19 +304,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// tail // tail
{ {
// even // even
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_in_global_block_soffset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); p_wei_global_block_soffset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads(); __syncthreads();
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard); p_in_register_clipboard,
p_in_global_block_voffset);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard); p_wei_register_clipboard,
p_wei_global_block_voffset);
for(index_t y = 0; y < Y; ++y) for(index_t y = 0; y < Y; ++y)
{ {
...@@ -336,7 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer ...@@ -336,7 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space); p_wei_block_double + wei_block_space);
#else #else
global_load_waitall(); global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double + in_block_space); p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment