Commit 8f0b9710 authored by Jing Zhang's avatar Jing Zhang
Browse files

global_soffset

parent 15c47cfd
......@@ -383,21 +383,25 @@ inline __device__ void ds_read_b128(data4_t& r, void* lds, index_t offset = 0)
}
inline __device__ void global_load(data4_t& r,
const data4_t* ptr,
index_t offset = 0)
const void* vptr,
const void* sprt = 0)
{
#if !NO_GLB_READ
if(offset == 0)
if(sprt == 0)
{
asm volatile("\n \
global_load_dwordx4 %0, %1, off \n \
"
: "=v"(r)
: "v"(ptr));
: "v"(vptr));
}
else
{
assert(false);
asm volatile("\n \
global_load_dwordx4 %0, %1, %2 \n \
"
: "=v"(r)
: "v"(vptr), "s"(sprt));
}
#endif
}
......
......@@ -491,7 +491,8 @@ struct Blockwise2dTensorCopy3
}
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* p_clipboard) const
Float* p_clipboard,
const index_t voff = 0) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -518,9 +519,16 @@ struct Blockwise2dTensorCopy3
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
auto f_copy = [&](index_t iloop) {
#if 1
data4_t* reg = (data4_t* )&p_clipboard[iloop * DataPerRead];
const void *vptr = (void* )(uintptr_t)((mSrcMyThreadOffset + voff) * 4);
const void *sprt = (void* )&p_src[iloop * src_loop_stride];
global_load(*reg, vptr, sprt);
#else
*(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
*(reinterpret_cast<const vector_t*>(
&p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
&p_src[mSrcMyThreadOffset + iloop * src_loop_stride + voff]));
#endif
};
for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
......
......@@ -193,28 +193,32 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
__shared__ Float p_in_block_double[2 * in_block_space];
__shared__ Float p_wei_block_double[2 * wei_block_space];
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_in_global_block_soffset =
p_in_global;
const index_t p_in_global_block_voffset = in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
const Float* p_wei_global_block_soffset =
p_wei_global;
const index_t p_wei_global_block_voffset = wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
// preload data into LDS
{
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
p_in_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard,
p_in_global_block_voffset);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard,
p_wei_global_block_voffset);
#if 0
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double);
#else
global_load_waitall();
global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
......@@ -250,16 +254,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
p_in_global_block_soffset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_soffset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads();
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
p_in_register_clipboard);
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard,
p_in_global_block_voffset);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard,
p_wei_global_block_voffset);
// compute on current data
// a series of GEMM
......@@ -286,7 +292,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_next);
#else
global_load_waitall();
global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
......@@ -298,19 +304,21 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
// tail
{
// even
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
p_in_global_block_soffset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_soffset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
__syncthreads();
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
p_in_register_clipboard);
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_soffset,
p_in_register_clipboard,
p_in_global_block_voffset);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
p_wei_register_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_soffset,
p_wei_register_clipboard,
p_wei_global_block_voffset);
for(index_t y = 0; y < Y; ++y)
{
......@@ -336,7 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
p_wei_block_double + wei_block_space);
#else
global_load_waitall();
global_load_wait_all();
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
p_in_block_double + in_block_space);
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment