Commit 88d51698 authored by root's avatar root
Browse files

debug

parent 758f576a
...@@ -209,7 +209,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad ...@@ -209,7 +209,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
decltype(a_k_m_global_move_slice_window_iterator_hack), decltype(a_k_m_global_move_slice_window_iterator_hack),
decltype(b_k_n_global_move_slice_window_iterator_hack)>; decltype(b_k_n_global_move_slice_window_iterator_hack)>;
const auto GridSize = (K / KPerBlock) * (Ho / HPerBlock) * (Wo / WPerBlock); const auto GridSize = (K / KPerBlock) * (Ho / HPerBlock) * (Wo / WPerBlock) * N;
const bool has_main_k_block_loop = (CYX + CYXPerBlock) / (2 * CYXPerBlock) > 1; const bool has_main_k_block_loop = (CYX + CYXPerBlock) / (2 * CYXPerBlock) > 1;
......
...@@ -110,16 +110,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -110,16 +110,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
__device__ void __device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto a_block_mtx = BlockMatrixA{}; constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto CYXPerBlock = a_block_mtx.GetLength(I0); constexpr auto CYXPerBlock = a_block_mtx.GetLength(I0);
static_assert(CYXPerBlock == CYXPerThreadLoop, "");
// thread A, B for GEMM // thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{})); make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));
...@@ -145,11 +147,16 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -145,11 +147,16 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
// loop over k // loop over k
for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop) for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop)
{ {
#if 1
a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) + a_thread_copy.Run(p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) +
mMyThreadOffsetA, mMyThreadOffsetA,
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0))); p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
#else
for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
p_a_thread[i] = 1;
#endif
threadwise_gemm.Run(p_a_thread, p_b_thread + CYXPerThreadLoop * cyx_begin, p_c_thread); threadwise_gemm.Run(p_a_thread, p_b_thread + cyx_begin, p_c_thread);
} }
} }
......
...@@ -88,8 +88,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -88,8 +88,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const auto CYX = a_cyx_k_global_desc.GetLength(I0); const auto CYX = a_cyx_k_global_desc.GetLength(I0);
const auto K = a_cyx_k_global_desc.GetLength(I1); const auto K = a_cyx_k_global_desc.GetLength(I1);
static_assert(CYX == 4 * 3 * 3 && K == 16, "");
const auto N = b_cyx_n_h_w_global_desc.GetLength(I1); const auto N = b_cyx_n_h_w_global_desc.GetLength(I1);
const auto H = b_cyx_n_h_w_global_desc.GetLength(I2); const auto H = b_cyx_n_h_w_global_desc.GetLength(I2);
const auto W = b_cyx_n_h_w_global_desc.GetLength(I3); const auto W = b_cyx_n_h_w_global_desc.GetLength(I3);
...@@ -102,6 +100,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -102,6 +100,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const index_t k_block_work_id = get_block_1d_id() / nhw_block_work_num; const index_t k_block_work_id = get_block_1d_id() / nhw_block_work_num;
const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num; const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
constexpr auto h_num_threads = HPerBlock / HPerThread;
constexpr auto w_num_threads = WPerBlock / WPerThread;
static_assert(KPerBlock == KPerThread, "");
const auto h_thread_id = get_thread_local_1d_id() / h_num_threads;
const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
#else #else
// Hack: this force result into SGPR // Hack: this force result into SGPR
const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock); const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
...@@ -114,8 +119,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -114,8 +119,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const index_t m_block_data_on_global = k_block_work_id * KPerBlock; const index_t m_block_data_on_global = k_block_work_id * KPerBlock;
const index_t h_block_data_on_global = nhw_block_work_id * 8; const index_t h_block_data_on_global = nhw_block_work_id * HPerBlock;
const index_t w_block_data_on_global = nhw_block_work_id * 8; const index_t w_block_data_on_global = nhw_block_work_id * WPerBlock;
// lds max alignment // lds max alignment
constexpr auto max_lds_align = constexpr auto max_lds_align =
...@@ -128,9 +133,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -128,9 +133,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// B matrix in LDS memory, dst of blockwise copy // B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment // be careful of LDS alignment
constexpr auto b_cyx_n_h_w_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2( constexpr auto b_cyx_n_h_w_block_desc =
make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<8>{}, Number<8>{}), make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
max_lds_align); Number<CYXPerBlock>{}, Number<1>{}, Number<HPerBlock>{}, Number<WPerBlock>{}));
// A matrix blockwise copy // A matrix blockwise copy
auto a_blockwise_copy = auto a_blockwise_copy =
...@@ -162,18 +167,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -162,18 +167,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
make_multi_index(0, 0)); make_multi_index(0, 0));
#if 1 #if 1
constexpr auto b_cyx_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto b_cyx_n_h_w_thread_desc =
make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<1>{}, Number<1>{})); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
const index_t h_thread_id = get_thread_local_1d_id() / 8;
const index_t w_thread_id = get_thread_local_1d_id() % 8;
using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2< using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
Float, Float,
Float, Float,
decltype(b_cyx_n_h_w_global_desc), decltype(b_cyx_n_h_w_global_desc),
decltype(b_cyx_n_h_w_thread_desc), decltype(b_cyx_n_h_w_thread_desc),
Sequence<CYXPerBlock, 1, 1, 1>, Sequence<CYXPerThread, 1, HPerThread, WPerThread>,
Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder, Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
3, // BBlockTransferSrcVectorDim, 3, // BBlockTransferSrcVectorDim,
1, // BBlockTransferSrcScalarPerVector, 1, // BBlockTransferSrcScalarPerVector,
...@@ -191,8 +194,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -191,8 +194,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
#endif #endif
// c_thread_mtx definition: this is a mess // c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx // TODO:: more elegent way of defining c_thread_mtx
constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto c_k_n_h_w_thread_desc =
make_tuple(Number<KPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{})); make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
#if 1 #if 1
const auto blockwise_gemm = const auto blockwise_gemm =
...@@ -200,14 +204,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -200,14 +204,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
decltype(a_cyx_k_block_desc), decltype(a_cyx_k_block_desc),
decltype(b_cyx_n_h_w_block_desc), decltype(b_cyx_n_h_w_block_desc),
decltype(c_k_n_h_w_thread_desc), decltype(c_k_n_h_w_thread_desc),
16, // KPerThreadSubC KPerThread, // KPerThreadSubC
1, // HPerThreadSubC HPerThread, // HPerThreadSubC
1, // WPerThreadSubC WPerThread, // WPerThreadSubC
1, // CYXPerThreadLoop CYXPerThread, // CYXPerThreadLoop
8, // HThreadCluster h_num_threads, // HThreadCluster
8, // WThreadCluster w_num_threads, // WThreadCluster
1, // ThreadGemmADataPerRead_K 1, // ThreadGemmADataPerRead_K
1 // ThreadGemmBDataPerRead_W 1 // ThreadGemmBDataPerRead_W
>{}; >{};
#endif #endif
...@@ -232,7 +236,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -232,7 +236,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread); // threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(CYXPerBlock, 0); constexpr auto a_block_slice_copy_step = make_multi_index(CYXPerBlock, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0); // constexpr auto b_block_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy // hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr auto a_k_m_global_iterator_hacks = AGlobalIteratorHacks{}; constexpr auto a_k_m_global_iterator_hacks = AGlobalIteratorHacks{};
...@@ -245,14 +249,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -245,14 +249,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
constexpr auto b_cyx_n_h_w_global_move_slice_window_iterator_hack = constexpr auto b_cyx_n_h_w_global_move_slice_window_iterator_hack =
BGlobalMoveSliceWindowIteratorHacks{}; BGlobalMoveSliceWindowIteratorHacks{};
Float p_b_thread[b_cyx_n_h_w_thread_desc.GetElementSpaceSize()];
// LDS double buffer: preload data into LDS // LDS double buffer: preload data into LDS
{ {
a_blockwise_copy.RunRead(a_cyx_k_global_desc, p_a_global, a_k_m_global_iterator_hacks); a_blockwise_copy.RunRead(a_cyx_k_global_desc, p_a_global, a_k_m_global_iterator_hacks);
constexpr auto b_thread_mtx = b_cyx_n_h_w_thread_desc;
Float p_b_thread[b_thread_mtx.GetElementSpaceSize()];
b_threadwise_transfer.Run(b_cyx_n_h_w_global_desc, b_threadwise_transfer.Run(b_cyx_n_h_w_global_desc,
p_b_global, p_b_global,
b_cyx_n_h_w_thread_desc, b_cyx_n_h_w_thread_desc,
...@@ -264,14 +267,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -264,14 +267,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
__syncthreads(); __syncthreads();
blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread); //blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread);
index_t sum = 0;
for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
sum += p_b_thread[i];
p_c_thread[0] = get_thread_local_1d_id() * 10000 + sum;
} }
#if 0 #if 0
if constexpr(HasMainKBlockLoop) if constexpr(HasMainKBlockLoop)
{ {
Float* p_a_block_even = p_a_block_double; Float* p_a_block_even = p_a_block_double;
Float* p_a_block_odd = p_a_block_double + a_block_space_size; Float* p_a_block_odd = p_a_block_double + a_block_space_size;
index_t b_block_data_begin = 0; index_t b_block_data_begin = 0;
...@@ -371,9 +379,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2 ...@@ -371,9 +379,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const index_t n_thread_data_on_global = const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col; n_block_data_on_global + c_thread_mtx_on_block.col;
#endif #endif
const index_t h_thread_id = get_thread_local_1d_id() / 8;
const index_t w_thread_id = get_thread_local_1d_id() % 8;
const index_t m_thread_data_on_global = m_block_data_on_global; const index_t m_thread_data_on_global = m_block_data_on_global;
const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id; const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id;
const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id; const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id;
......
...@@ -78,7 +78,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc ...@@ -78,7 +78,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t KPerThread = 16; constexpr index_t KPerThread = 16;
constexpr index_t HPerThread = 1; constexpr index_t HPerThread = 1;
constexpr index_t WPerThread = 1; constexpr index_t WPerThread = 1;
constexpr index_t CYXPerThread = 4; constexpr index_t CYXPerThread = 4 * 3 * 3;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>; using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>; using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
...@@ -86,8 +86,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc ...@@ -86,8 +86,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1; constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1; constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>; using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<36, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<9, 64>; using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1; constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1; constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment