"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "05d382183f431d6166c814015527f473814480b6"
Commit 415b4cbd authored by root

debugging input load

parent 91ef99a7
@@ -76,7 +76,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
     __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
     {
-        return MatrixIndex{1, 8, 8};
+        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
+        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
+
+        constexpr auto num_w_threads = W / WPerThread;
+        constexpr auto num_h_threads = H / HPerThread;
+
+        index_t k_thread_id = thread_id / (num_w_threads * num_h_threads);
+        index_t h_thread_id = thread_id / num_w_threads;
+        index_t w_thread_id = thread_id % num_w_threads;
+
+        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
     }

     template <typename SrcDesc,
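The hard-coded MatrixIndex{1, 8, 8} is replaced by a real decomposition of the flat thread id into (k, h, w) block-tile coordinates. Note that h_thread_id = thread_id / num_w_threads only lands in range while there is a single k group, which the static_assert(KPerBlock == KPerThread, "") in the gridwise kernel below enforces for this configuration. A minimal host-side sketch of the same arithmetic, under the assumed 8 x 8 thread grid implied by HPerBlock = WPerBlock = 8 and HPerThread = WPerThread = 1 in the device file:

#include <cstdio>

// Host-side sketch of the (k, h, w) split in GetBeginOfThreadMatrixC.
// num_h_threads/num_w_threads are assumed values (8 x 8 = 64 threads,
// one k group), mirroring HPerBlock/WPerBlock = 8 with 1 x 1 thread tiles.
int main()
{
    const int num_h_threads = 8;
    const int num_w_threads = 8;

    const int samples[] = {0, 1, 7, 8, 63};
    for(int tid : samples)
    {
        int k = tid / (num_w_threads * num_h_threads); // 0 for all 64 threads
        int h = tid / num_w_threads;                   // row within the block tile
        int w = tid % num_w_threads;                   // column within the block tile
        std::printf("tid %2d -> (k=%d, h=%d, w=%d)\n", tid, k, h, w);
    }
}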
@@ -127,10 +138,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
             make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));

         constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{},
+            // Number<WPerThread>{}));
             make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));

         constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
+            // make_tuple(Number<KPerThread>{}, Number<1>{},
+            // Number<HPerThread>{}, Number<WPerThread>{}));
             make_tuple(Number<KPerThread>{}, Number<1>{}));

         FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
@@ -147,15 +161,14 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         // loop over k
         for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop)
         {
-#if 0
+#if 1
             a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
                                   mMyThreadOffsetA,
-                              p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
+                              p_a_thread);
 #else
             for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
                 p_a_thread[i] = 1;
 #endif

             threadwise_gemm.Run(p_a_thread, p_b_thread + cyx_begin, p_c_thread);
         }
     }
...
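Flipping #if 0 to #if 1 re-enables the real LDS-to-register copy of the A fragment in place of the all-ones fill, and drops the redundant a_thread_mtx.CalculateOffset(make_tuple(0, 0)) from the destination (that offset is zero for a packed descriptor). The general shape of this isolation toggle, with load_real as a hypothetical stand-in for a_thread_copy.Run:

#include <cstddef>

// Generic form of the toggle above: route either the real load or a known
// constant into the thread buffer, so downstream math can be checked
// against a predictable input. load_real() is a hypothetical stand-in.
template <typename T, std::size_t N>
void fill_known(T (&buf)[N], T value)
{
    for(std::size_t i = 0; i < N; ++i)
        buf[i] = value; // mirrors: p_a_thread[i] = 1;
}

int main()
{
    float p_a_thread[4];
#if 0
    load_real(p_a_thread); // hypothetical real copy path
#else
    fill_known(p_a_thread, 1.0f); // constant path from the diff
#endif
}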
@@ -94,33 +94,33 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         // divide block work by [M, N]
 #if 1
         const auto m_block_work_num = K / Number<KPerBlock>{};
-        const auto nhw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});
+        const auto hw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});

-        const index_t k_block_work_id = get_block_1d_id() / nhw_block_work_num;
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+        const index_t k_block_work_id = get_block_1d_id() / hw_block_work_num;
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;

         constexpr auto h_num_threads = HPerBlock / HPerThread;
         constexpr auto w_num_threads = WPerBlock / WPerThread;

         static_assert(KPerBlock == KPerThread, "");

-        const auto h_thread_id = get_thread_local_1d_id() / h_num_threads;
+        const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
         const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
 #else
         // Hack: this force result into SGPR
         const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t nhw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
+        const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);

         const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / nhw_block_work_num);
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
 #endif

         const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
-        const index_t h_block_data_on_global = nhw_block_work_id * HPerBlock;
-        const index_t w_block_data_on_global = nhw_block_work_id * WPerBlock;
+        const index_t h_block_data_on_global = hw_block_work_id * HPerBlock;
+        const index_t w_block_data_on_global = hw_block_work_id * WPerBlock;

         // lds max alignment
         constexpr auto max_lds_align =
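Two changes here: the nhw_* work variables are renamed hw_* (each block covers one HPerBlock x WPerBlock patch of the N * H * W space), and h_thread_id now divides by w_num_threads instead of h_num_threads. Recovering a row index from a flat thread id laid out over an h x w grid requires dividing by the row width w_num_threads; the old divisor only worked by coincidence on square grids like the 8 x 8 one below. A tiny sketch with hypothetical non-square sizes makes the difference visible:

#include <cstdio>

// Row/column recovery from a flat thread id over an h x w thread grid.
// Sizes are hypothetical (2 rows x 4 columns) to expose the old bug.
int main()
{
    const int h_num_threads = 2;
    const int w_num_threads = 4;

    for(int tid = 0; tid < h_num_threads * w_num_threads; ++tid)
    {
        int h_new = tid / w_num_threads; // fixed: 0,0,0,0,1,1,1,1 (in range)
        int h_old = tid / h_num_threads; // old:   0,0,1,1,2,2,3,3 (rows 2,3 do not exist)
        int w     = tid % w_num_threads;
        std::printf("tid %d -> old h=%d, new h=%d, w=%d\n", tid, h_old, h_new, w);
    }
}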
@@ -166,7 +166,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_cyx_k_block_desc,
             make_multi_index(0, 0));

-#if 1
         constexpr auto b_cyx_n_h_w_thread_desc =
             make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
                 Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
@@ -191,7 +190,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             make_multi_index(
                 0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
-#endif

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_k_n_h_w_thread_desc =
@@ -264,20 +262,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);

-#if 0
+#if 1
             __syncthreads();

+            //blockwise_gemm.Run(p_a_block_double, p_b_thread_double, p_c_thread);
+
             index_t sum = 0;
             for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-                sum += p_b_thread[i];
+                sum += p_b_thread_double[i];

-            p_c_thread[0] = get_thread_local_1d_id() * 10000 + sum;
+            p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
+            p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
+            p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
 #endif
         }

-#if 1
+#if 0
         if constexpr(HasMainKBlockLoop)
         {
             Float* p_a_block_even = p_a_block_double;
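This block is the "debugging input load" of the commit message: the blockwise GEMM call stays commented out, and each thread instead folds the first nine elements of its B buffer into p_c_thread[0] (the sum over the full buffer is computed but never stored). Nine matches CYXPerThread = 3 * 3 from the device file below, so the output tensor becomes a direct readback of the loaded input. A small sketch of the host-side expectation this enables, with a hypothetical constant fill:

#include <cstdio>

// Sketch of the readback trick: if the input is filled with a known
// constant, every p_c_thread[0] the kernel writes should equal
// CYXPerThread * constant. Values here are assumptions for illustration.
int main()
{
    const int CYXPerThread = 3 * 3; // matches the device file in this commit
    const float fill       = 1.0f;  // hypothetical input fill value

    float expected = CYXPerThread * fill;
    std::printf("each c[0] should read back %.1f if the load is correct\n", expected);
}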
@@ -398,8 +396,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         {
             // define input tensor descriptor for threadwise copy
             // thread input tensor, src of threadwise copy
-            constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-                make_tuple(Number<KPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            constexpr auto c_k_n_h_w_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                    Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

             // calculate origin of thread input tensor on global memory
             // blockwise GEMM c matrix starting index
@@ -414,8 +413,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                 n_block_data_on_global + c_thread_mtx_on_block.col;
 #endif

             const index_t m_thread_data_on_global = m_block_data_on_global;
-            const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id;
-            const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id;
+            const index_t h_thread_data_on_global =
+                h_block_data_on_global + h_thread_id * HPerThread;
+            const index_t w_thread_data_on_global =
+                w_block_data_on_global + w_thread_id * WPerThread;

             // hack to control index calculation when iterating over c_k_n_h_w_global tensor
             constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
...
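The change in the last hunk above scales the thread id by the per-thread tile size when locating each thread's output origin in the global tensor. With HPerThread = WPerThread = 1 (the current configuration) the multiplication is a no-op, but for wider per-thread tiles the old expression would make neighbouring threads write overlapping regions. A sketch under an assumed WPerThread = 2:

#include <cstdio>

// Why the thread origin must be scaled by the per-thread tile size.
// Hypothetical sizes: each thread owns a 2-wide strip (WPerThread = 2).
int main()
{
    const int WPerThread             = 2;
    const int w_block_data_on_global = 0; // assume the first block

    for(int w_thread_id = 0; w_thread_id < 4; ++w_thread_id)
    {
        int old_origin = w_block_data_on_global + w_thread_id;              // strips overlap
        int new_origin = w_block_data_on_global + w_thread_id * WPerThread; // strips disjoint
        std::printf("thread %d: old=%d new=%d\n", w_thread_id, old_origin, new_origin);
    }
}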
@@ -73,20 +73,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t KPerBlock = 16;
     constexpr index_t HPerBlock = 8;
     constexpr index_t WPerBlock = 8;
-    constexpr index_t CYXPerBlock = 4;
+    constexpr index_t CYXPerBlock = 4 * 3 * 3;

     constexpr index_t KPerThread = 16;
     constexpr index_t HPerThread = 1;
     constexpr index_t WPerThread = 1;
-    constexpr index_t CYXPerThread = 4;
+    constexpr index_t CYXPerThread = 3 * 3;

-    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

-    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<36, 1>;
     using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;

     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
...
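The new constants spell out a 3 x 3 filter: CYXPerBlock = 4 * 3 * 3 = 36 and CYXPerThread = 3 * 3 = 9, and the transfer shapes are updated to stay consistent, since thread slice lengths times thread cluster lengths must reproduce the block tile. For GemmA, Sequence<9, 1> times Sequence<4, 16> gives [36, 16] = [CYXPerBlock, KPerBlock]; for GemmB, Sequence<36, 1> times Sequence<1, 64> gives [36, 64], where 64 = HPerBlock * WPerBlock. The same check as a compile-time sketch (all names are local restatements of the constants in this hunk):

// Compile-time consistency check for the new transfer shapes.
constexpr int KPerBlock   = 16;
constexpr int HPerBlock   = 8;
constexpr int WPerBlock   = 8;
constexpr int CYXPerBlock = 4 * 3 * 3; // C * Y * X = 36

// GemmA: thread slice [9, 1] x thread cluster [4, 16] = block tile [36, 16]
static_assert(9 * 4 == CYXPerBlock && 1 * 16 == KPerBlock, "GemmA tile mismatch");

// GemmB: thread slice [36, 1] x thread cluster [1, 64] = block tile [36, 64]
static_assert(36 * 1 == CYXPerBlock && 1 * 64 == HPerBlock * WPerBlock, "GemmB tile mismatch");

int main() {} // nothing to run; the static_asserts are the check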