Commit 415b4cbd authored by root

debugging input load

parent 91ef99a7
@@ -76,7 +76,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
return MatrixIndex{1, 8, 8};
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
constexpr auto num_w_threads = W / WPerThread;
constexpr auto num_h_threads = H / HPerThread;
index_t k_thread_id = thread_id / (num_w_threads * num_h_threads);
index_t h_thread_id = thread_id / num_w_threads;
index_t w_thread_id = thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
}
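// Illustrative host-side sketch, not part of the diff: the new
// GetBeginOfThreadMatrixC above splits the flat thread id into (k, h, w)
// thread coordinates from the B block-tile lengths. The lengths below are
// made-up example values; note that this model reduces the h index modulo
// num_h_threads, whereas the committed line
// "h_thread_id = thread_id / num_w_threads" appears to omit that reduction.
#include <cstdio>

int main()
{
    const int H = 8, W = 8;                    // example B tile lengths
    const int HPerThread = 1, WPerThread = 1;  // example per-thread tile
    const int num_w_threads = W / WPerThread;
    const int num_h_threads = H / HPerThread;

    for(int thread_id = 0; thread_id < 2 * num_h_threads * num_w_threads; ++thread_id)
    {
        const int k = thread_id / (num_w_threads * num_h_threads);
        const int h = (thread_id / num_w_threads) % num_h_threads;
        const int w = thread_id % num_w_threads;
        std::printf("tid %3d -> (k=%d, h=%d, w=%d)\n", thread_id, k, h, w);
    }
}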
template <typename SrcDesc,
@@ -127,10 +138,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
// make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
// make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{},
// Number<WPerThread>{}));
make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));
constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
// make_tuple(Number<KPerThread>{}, Number<1>{},
// Number<HPerThread>{}, Number<WPerThread>{}));
make_tuple(Number<KPerThread>{}, Number<1>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
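// Illustrative sketch, not part of the diff: a host-side model of what the
// "packed" naive descriptors above imply. For lengths {L0, L1} the strides
// are row-major packed strides, GetElementSpaceSize() is the product of the
// lengths (which is what sizes p_a_thread), and CalculateOffset is a dot
// product of an index with the strides. The lengths below are example values.
#include <cstdio>

int main()
{
    const int lengths[2] = {4, 16};              // e.g. {CYXPerThreadLoop, KPerThread}
    const int strides[2] = {lengths[1], 1};      // packed: innermost stride is 1
    const int element_space = lengths[0] * lengths[1];

    const int idx[2] = {2, 5};
    const int offset = idx[0] * strides[0] + idx[1] * strides[1];

    std::printf("element space = %d, offset of (2, 5) = %d\n", element_space, offset);
}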
@@ -147,15 +161,14 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
// loop over k
for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop)
{
#if 0
#if 1
a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
mMyThreadOffsetA,
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
p_a_thread);
#else
for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
p_a_thread[i] = 1;
#endif
threadwise_gemm.Run(p_a_thread, p_b_thread + cyx_begin, p_c_thread);
}
}
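// Illustrative sketch, not part of the diff: a reference for what the
// threadwise GEMM call in the loop above is expected to accumulate given the
// 2-D thread descriptors shown, i.e. A is {CYXPerThreadLoop, KPerThread}
// (reduction dimension outermost), B is {CYXPerThreadLoop, 1} and C is
// {KPerThread, 1}, so each step does C[k] += sum over cyx of A[cyx][k] * B[cyx].
// Values are examples only.
#include <cstdio>
#include <vector>

int main()
{
    const int CYXPerThreadLoop = 4, KPerThread = 16;
    std::vector<float> a(CYXPerThreadLoop * KPerThread, 1.0f); // p_a_thread analogue
    std::vector<float> b(CYXPerThreadLoop, 2.0f);              // per-step B slice
    std::vector<float> c(KPerThread, 0.0f);                    // p_c_thread analogue

    for(int cyx = 0; cyx < CYXPerThreadLoop; ++cyx)
        for(int k = 0; k < KPerThread; ++k)
            c[k] += a[cyx * KPerThread + k] * b[cyx];

    std::printf("c[0] = %.1f (expected %.1f)\n", c[0], CYXPerThreadLoop * 1.0f * 2.0f);
}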
@@ -95,32 +95,32 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// divide block work by [M, N]
#if 1
const auto m_block_work_num = K / Number<KPerBlock>{};
const auto nhw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});
const auto hw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});
const index_t k_block_work_id = get_block_1d_id() / nhw_block_work_num;
const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hw_block_work_num;
const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
constexpr auto h_num_threads = HPerBlock / HPerThread;
constexpr auto w_num_threads = WPerBlock / WPerThread;
static_assert(KPerBlock == KPerThread, "");
const auto h_thread_id = get_thread_local_1d_id() / h_num_threads;
const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
#else
// Hack: this forces the result into SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
const index_t nhw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
const index_t k_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / nhw_block_work_num);
const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
__builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
#endif
const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
const index_t h_block_data_on_global = nhw_block_work_id * HPerBlock;
const index_t w_block_data_on_global = nhw_block_work_id * WPerBlock;
const index_t h_block_data_on_global = hw_block_work_id * HPerBlock;
const index_t w_block_data_on_global = hw_block_work_id * WPerBlock;
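// Illustrative sketch, not part of the diff: a host-side model of the work
// split above. The flat block id is split into (k_block, hw_block) and each
// part is scaled by the per-block tile size to get the global data origins
// (note that the same hw_block_work_id feeds both the H and W origins,
// mirroring the lines above); within a block, the flat thread id is split
// into (h_thread, w_thread). All numbers below are example values.
#include <cstdio>

int main()
{
    const int N = 1, H = 32, W = 32;
    const int KPerBlock = 16, HPerBlock = 8, WPerBlock = 8;
    const int WPerThread = 1;

    const int hw_block_work_num = (N * H * W) / (HPerBlock * WPerBlock);

    const int block_id         = 21;                                // example flat block id
    const int k_block_work_id  = block_id / hw_block_work_num;
    const int hw_block_work_id = block_id - k_block_work_id * hw_block_work_num;

    const int m_block_data_on_global = k_block_work_id * KPerBlock;
    const int h_block_data_on_global = hw_block_work_id * HPerBlock;
    const int w_block_data_on_global = hw_block_work_id * WPerBlock;

    const int w_num_threads = WPerBlock / WPerThread;
    const int thread_id     = 13;                                   // example flat thread id
    const int h_thread_id   = thread_id / w_num_threads;
    const int w_thread_id   = thread_id % w_num_threads;

    std::printf("block %d -> (k %d, hw %d), origins (m %d, h %d, w %d); "
                "thread %d -> (h %d, w %d)\n",
                block_id, k_block_work_id, hw_block_work_id,
                m_block_data_on_global, h_block_data_on_global, w_block_data_on_global,
                thread_id, h_thread_id, w_thread_id);
}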
// lds max alignment
constexpr auto max_lds_align =
@@ -166,7 +166,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
a_cyx_k_block_desc,
make_multi_index(0, 0));
#if 1
constexpr auto b_cyx_n_h_w_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
@@ -191,7 +190,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
make_multi_index(
0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
#endif
// c_thread_mtx definition: this is a mess
// TODO: more elegant way of defining c_thread_mtx
constexpr auto c_k_n_h_w_thread_desc =
@@ -264,20 +262,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);
#if 0
#if 1
__syncthreads();
//blockwise_gemm.Run(p_a_block_double, p_b_thread_double, p_c_thread);
index_t sum = 0;
for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
sum += p_b_thread[i];
sum += p_b_thread_double[i];
p_c_thread[0] = get_thread_local_1d_id() * 10000 + sum;
p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
#endif
}
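// Illustrative sketch, not part of the diff: the debug path above packs
// "thread id * 10000 + checksum of the loaded B values" into the first C
// element so the written-out tensor can be decoded offline. A tiny model of
// that encode/decode convention, with made-up values:
#include <cstdio>

int main()
{
    const int thread_id = 42;
    const int sum       = 123;                       // stand-in checksum of p_b_thread_double
    const int encoded   = thread_id * 10000 + sum;   // what lands in p_c_thread[0]

    std::printf("encoded %d -> thread %d, checksum %d\n",
                encoded, encoded / 10000, encoded % 10000);
}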
#if 1
#if 0
if constexpr(HasMainKBlockLoop)
{
Float* p_a_block_even = p_a_block_double;
@@ -398,8 +396,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
{
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
constexpr auto c_k_n_h_w_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
@@ -414,8 +413,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
n_block_data_on_global + c_thread_mtx_on_block.col;
#endif
const index_t m_thread_data_on_global = m_block_data_on_global;
const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id;
const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id;
const index_t h_thread_data_on_global =
h_block_data_on_global + h_thread_id * HPerThread;
const index_t w_thread_data_on_global =
w_block_data_on_global + w_thread_id * WPerThread;
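// Illustrative sketch, not part of the diff: the change above scales the
// per-thread H/W origin by HPerThread/WPerThread, so consecutive thread ids
// start one whole per-thread tile apart and their output tiles do not
// overlap. Example values below are made up.
#include <cstdio>

int main()
{
    const int HPerThread = 2, WPerThread = 2;
    const int h_block_data_on_global = 8, w_block_data_on_global = 8;

    for(int h_thread_id = 0; h_thread_id < 2; ++h_thread_id)
        for(int w_thread_id = 0; w_thread_id < 2; ++w_thread_id)
        {
            const int h0 = h_block_data_on_global + h_thread_id * HPerThread;
            const int w0 = w_block_data_on_global + w_thread_id * WPerThread;
            std::printf("thread (%d,%d) covers rows [%d,%d) x cols [%d,%d)\n",
                        h_thread_id, w_thread_id,
                        h0, h0 + HPerThread, w0, w0 + WPerThread);
        }
}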
// hack to control index calculation when iterating over c_k_n_h_w_global tensor
constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
@@ -73,20 +73,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t KPerBlock = 16;
constexpr index_t HPerBlock = 8;
constexpr index_t WPerBlock = 8;
constexpr index_t CYXPerBlock = 4;
constexpr index_t CYXPerBlock = 4 * 3 * 3;
constexpr index_t KPerThread = 16;
constexpr index_t HPerThread = 1;
constexpr index_t WPerThread = 1;
constexpr index_t CYXPerThread = 4;
constexpr index_t CYXPerThread = 3 * 3;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<36, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
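// Illustrative sketch, not part of the diff: a sanity check that the retuned
// constants above still tile the per-block workload, i.e. that
// thread-slice length x thread-cluster length equals the block tile length in
// each GEMM dimension. CYXPerBlock = 4 * 3 * 3 presumably corresponds to a
// C = 4, 3x3 filter window; the other numbers are copied from the constants.
#include <cassert>
#include <cstdio>

int main()
{
    const int KPerBlock = 16, HPerBlock = 8, WPerBlock = 8;
    const int CYXPerBlock = 4 * 3 * 3;

    // A transfer: slice {9, 1} x cluster {4, 16} covers {CYXPerBlock, KPerBlock}
    assert(9 * 4 == CYXPerBlock && 1 * 16 == KPerBlock);

    // B transfer: slice {36, 1} x cluster {1, 64} covers {CYXPerBlock, HPerBlock * WPerBlock}
    assert(36 * 1 == CYXPerBlock && 1 * 64 == HPerBlock * WPerBlock);

    std::printf("block transfer tilings cover %d x %d (A) and %d x %d (B)\n",
                CYXPerBlock, KPerBlock, CYXPerBlock, HPerBlock * WPerBlock);
}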