Commit 9f633f91 authored by Jing Zhang's avatar Jing Zhang
Browse files

debugging

parent 888b7c78
...@@ -130,7 +130,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -130,7 +130,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
constexpr auto a_g_k_m_kpack_block_desc = make_native_tensor_descriptor_aligned( constexpr auto a_g_k_m_kpack_block_desc = make_native_tensor_descriptor_aligned(
Sequence<1, KPerBlock, MPerBlock, KPack>{}, Number<max_align>{}); Sequence<1, KPerBlock, MPerBlock, KPack>{}, Number<max_align>{});
auto a_blockwise_copy = BlockwiseGenericTensorSliceCopy_v4< auto a_blockwise_copy = BlockwiseGenericTensorSliceCopy_v5<
BlockSize, BlockSize,
decltype(a_g_k_m_kpack_global_desc), decltype(a_g_k_m_kpack_global_desc),
decltype(a_g_k_m_kpack_block_desc), decltype(a_g_k_m_kpack_block_desc),
...@@ -225,14 +225,14 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -225,14 +225,14 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
for(index_t k_block_data_begin = 0; k_block_data_begin < K - KPerBlock; for(index_t k_block_data_begin = 0; k_block_data_begin < K - KPerBlock;
k_block_data_begin += KPerBlock) k_block_data_begin += KPerBlock)
{ {
ABFloat p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; // ABFloat p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
ABFloat p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; ABFloat p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
// load next data from device mem // load next data from device mem
a_blockwise_copy.MoveSrcSliceWindow(blockwise_a_copy_src_step, True); a_blockwise_copy.MoveSrcSliceWindow(blockwise_a_copy_src_step, True);
b_blockwise_copy.MoveSrcSliceWindow(blockwise_b_copy_src_step, True); b_blockwise_copy.MoveSrcSliceWindow(blockwise_b_copy_src_step, True);
a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); a_blockwise_copy.RunLoadThreadBuffer(p_a_global);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
block_sync_lds(); block_sync_lds();
...@@ -250,7 +250,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org ...@@ -250,7 +250,7 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2_org
block_sync_lds(); block_sync_lds();
// store next data to LDS // store next data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block); a_blockwise_copy.RunStoreThreadBuffer(p_a_block);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block); b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block);
} }
......
...@@ -95,6 +95,7 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -95,6 +95,7 @@ struct ThreadwiseGenericTensorSliceCopy_v5
*reinterpret_cast<SrcData*>(&p_dst[dst_offset]) = src_data; *reinterpret_cast<SrcData*>(&p_dst[dst_offset]) = src_data;
} }
#if 1
template <typename SrcData, index_t SrcDataPerAccess> template <typename SrcData, index_t SrcDataPerAccess>
struct vector_data_load; struct vector_data_load;
...@@ -127,6 +128,20 @@ struct ThreadwiseGenericTensorSliceCopy_v5 ...@@ -127,6 +128,20 @@ struct ThreadwiseGenericTensorSliceCopy_v5
return load_data<float4_t>(p_src, src_coord_begin.GetOffset()); return load_data<float4_t>(p_src, src_coord_begin.GetOffset());
} }
}; };
#else
template <typename SrcData, index_t SrcDataPerAccess>
struct vector_data_load
{
template <typename SrcCoord>
__device__ static auto run(const float* p_src, const SrcCoord src_coord_begin)
{
auto src_offset = src_coord_begin.GetOffset();
return amd_buffer_load<SrcData, SrcDataPerAccess>(
p_src, src_offset, true, SrcDataPerAccess);
}
};
#endif
template <typename DstData, index_t DstDataPerAccess> template <typename DstData, index_t DstDataPerAccess>
struct vector_data_store; struct vector_data_store;
......
...@@ -76,6 +76,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( ...@@ -76,6 +76,7 @@ void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
constexpr index_t GemmM = K; constexpr index_t GemmM = K;
constexpr index_t GemmN = N * Ho * Wo; constexpr index_t GemmN = N * Ho * Wo;
constexpr index_t GemmK = C * Y * X;
constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) * constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
math::integer_divide_ceil(GemmN, GemmNPerBlock); math::integer_divide_ceil(GemmN, GemmNPerBlock);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment