Commit 18328e2f authored by Chao Liu

experimenting with LDS read

parent 4facbe99
@@ -593,9 +593,9 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #elif 1
-    // 1x1 filter, 14x14 image, C = 256
+    // 1x1 filter, 14x14 image, C = 128
     constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 14;
     constexpr index_t WI = 14;
     constexpr index_t K  = 512;
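For reference, with a 1x1 filter, stride 1 and zero padding (so Ho = HI = 14 and Wo = WI = 14), this configuration corresponds to an implicit GEMM with M = K = 512, N_gemm = N * Ho * Wo = 128 * 14 * 14 = 25088, and a reduction length of C; the edit halves that reduction length from 256 to 128 and updates the comment to match (dimension mapping assumed from the chwn/cyxk/khwn kernel exercised by this commit).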
@@ -638,10 +638,16 @@ int main(int argc, char* argv[])

     if(do_verification)
     {
-#if 1
+#if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#elif 1
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #elif 0
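The two new #elif branches allow verification with ones in one tensor and small random integers in the other, which makes it easier to tell whether a mismatch comes from the input path or the weight path. A minimal sketch of generators with this interface (the real GeneratorTensor_1 / GeneratorTensor_2 live elsewhere in the repo and may differ in detail):

// Sketch only: hypothetical functors matching the call sites above.
#include <cstdlib>

struct GeneratorTensor_1
{
    // fill every element with 1
    template <class... Is>
    float operator()(Is...) const { return 1.0f; }
};

struct GeneratorTensor_2
{
    // fill with integers drawn from [min_value, max_value]
    int min_value = 0;
    int max_value = 1;

    template <class... Is>
    float operator()(Is...) const
    {
        return static_cast<float>(min_value + std::rand() % (max_value - min_value + 1));
    }
};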
......
@@ -336,7 +336,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     __device__ void Run(const FloatA* __restrict__ p_a_block,
                         const FloatB* __restrict__ p_b_block,
                         FloatC* __restrict__ p_c_thread,
-                        Accumulator f_accum) const
+                        Accumulator f_accum,
+                        const float* const p_lds_begin) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
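Run now takes the LDS base pointer so that the threadwise copy below can express its ds_read addresses as byte offsets from a single, known LDS origin. A hedged sketch of the caller-side wiring, mirroring the gridwise kernel change further down in this commit:

// Sketch only: how a caller is expected to provide p_lds_begin.
// Whichever __shared__ array has the lower address becomes the zero point,
// so every LDS-resident operand yields a non-negative offset later on.
const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;

blockwise_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                   p_in_block + y * Wi + x,
                   p_out_thread,
                   f_accum,
                   p_lds_begin);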
@@ -383,28 +384,36 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             // copy A-sub to form A
             for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
-                threadwise_matrix_copy(
+                threadwise_matrix_copy_v2(
                     a_block_mtx,
                     p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
                         mMyThreadOffsetA,
                     a_thread_mtx,
                     p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
-                    a_thread_sub_mtx.GetLengths());
+                    a_thread_sub_mtx.GetLengths(),
+                    p_lds_begin);
             }

 #pragma unroll
             // copy B-sub to form B
             for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-                threadwise_matrix_copy(
+                threadwise_matrix_copy_v2(
                     b_block_mtx,
                     p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
                         mMyThreadOffsetB,
                     b_thread_mtx,
                     p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
-                    b_thread_sub_mtx.GetLengths());
+                    b_thread_sub_mtx.GetLengths(),
+                    p_lds_begin);
             }

+#if 0
+            asm volatile("\n \
+            s_waitcnt lgkmcnt(0) \n \
+            " ::);
+#endif
+
             // C = A * B
             threadwise_gemm(a_thread_mtx,
                             True,
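The new (and currently disabled) s_waitcnt block documents a real hazard: once the LDS loads are issued as raw ds_read instructions inside inline asm, the compiler still tracks the output registers but knows nothing about the hardware's LGKM counter, so nothing guarantees the data has arrived before threadwise_gemm reads p_a_thread / p_b_thread. A hedged sketch of the explicit wait (draining lgkmcnt to 0 waits for all outstanding LDS traffic):

// Sketch only: force all outstanding LDS reads to complete before the
// FMA loop consumes the thread-local A/B tiles. This is what the "#if 0"
// block above would do if enabled; the "memory" clobber keeps the compiler
// from reordering accesses across the wait.
asm volatile("s_waitcnt lgkmcnt(0)" ::: "memory");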
@@ -564,7 +573,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                          FloatB* const p_b_block,
                                          FloatC* p_c_thread,
                                          Accumulator f_accum,
-                                         float* p_lds_begin) const
+                                         const float* const p_lds_begin) const
     {
         constexpr auto True  = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
@@ -669,7 +678,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                     p_lds_begin);
             }

-#if 1
+#if 0
             asm volatile("\n \
             s_waitcnt lgkmcnt(0) \n \
             " ::);
......
@@ -207,7 +207,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
     __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];

 #if 1
-    constexpr Float* p_lds_begin = p_wei_block;
+    const Float* p_lds_begin = p_in_block < p_wei_block ? p_in_block : p_wei_block;
 #endif

     const Float* p_in_global_block_offset =
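Two fixes in one line: the address of a __shared__ array is not a compile-time constant, so the constexpr qualifier is dropped, and taking the lower of the two LDS arrays as the base means both p_in_block and p_wei_block later yield non-negative offsets when the threadwise copy computes p_src - p_lds_begin. A minimal helper showing the byte-offset computation this base pointer enables (a sketch; the experiment additionally assumes the lowest-addressed array sits at LDS byte address 0):

// Sketch only: turn an LDS pointer into the byte offset that ds_read expects,
// assuming p_lds_begin is the lowest-addressed __shared__ array in the kernel.
template <typename Float>
__device__ uint32_t lds_byte_offset(const Float* p, const Float* p_lds_begin)
{
    return static_cast<uint32_t>(sizeof(Float) * (p - p_lds_begin));
}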
@@ -240,18 +240,18 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
             for(index_t x = 0; x < X; ++x)
             {
                 auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
                 blockwise_gemm.Run
 #elif 0
                 blockwise_gemm.Run_asm
-#elif 1
+#elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
                     (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
                      p_in_block + y * Wi + x,
                      p_out_thread,
                      f_accum,
                      p_lds_begin);
             }
         }
     }
......
@@ -28,12 +28,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
                                           DstMatrix,
                                           Float* __restrict__ p_dst,
                                           Sequence<NRow, NCol>,
-                                          const float* p_lds_begin)
+                                          const float* const p_lds_begin)
 {
     constexpr auto src_mtx = SrcMatrix{};
     constexpr auto dst_mtx = DstMatrix{};

-#if 1
+#if 0
     for(index_t i = 0; i < NRow; ++i)
     {
         for(index_t j = 0; j < NCol; ++j)
@@ -48,11 +48,11 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
             ds_read_b32 %0, %1 \n \
             "
             : "=v"(p_dst[dst_index])
-            : "v"((uint32_t)((uintptr_t)((p_src + src_index) - p_lds_begin))));
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
 #endif
         }
     }
-#elif 0
+#elif 1
     static_assert(NCol == 4, "only for NCol == 4");

     using vector_t = typename vector_type<Float, 4>::MemoryType;
@@ -66,11 +66,12 @@ __device__ void threadwise_matrix_copy_v2(SrcMatrix,
         *(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
             *(reinterpret_cast<const vector_t*>(p_src + src_index));
 #elif 1
-        asm volatile("\n \
-            ds_read_b128 %0, %1, offset:0 \n \
+        asm volatile(
+            "\n \
+            ds_read_b128 %0, %1 \n \
             "
             : "=v"(*(reinterpret_cast<vector_t*>(p_dst + dst_index)))
-            : "v"((uint32_t)((uintptr_t)(p_src + src_index - p_lds_begin))));
+            : "v"((uint32_t)(sizeof(Float) * (uintptr_t)((p_src + src_index) - p_lds_begin))));
 #endif
     }
 #endif
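The key correction in both asm branches is that the address operand passed to ds_read_b32 / ds_read_b128 is now a byte offset (the element distance from p_lds_begin scaled by sizeof(Float)) rather than a raw element count; the ds_read_b128 line also drops its explicit offset:0 suffix. A hedged sketch of what the vector branch works out to, using the same names as above:

// Sketch only: the corrected ds_read_b128 operand. %1 must hold a byte
// address into LDS, so the element distance is scaled by sizeof(Float);
// the 16-byte alignment ds_read_b128 needs is what the max_align padding
// of the __shared__ arrays is meant to provide.
const uint32_t lds_byte_addr = static_cast<uint32_t>(
    sizeof(Float) * ((p_src + src_index) - p_lds_begin));

asm volatile("ds_read_b128 %0, %1\n"
             : "=v"(*reinterpret_cast<vector_t*>(p_dst + dst_index))
             : "v"(lds_byte_addr));

// Note: an s_waitcnt lgkmcnt(...) still has to execute before p_dst is read,
// which is what the (currently disabled) waits in the blockwise GEMM are for.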
......