"vscode:/vscode.git/clone" did not exist on "d42612375e69d3868286fbfde5e2750050c1734c"
Commit 583aab02 authored by Chao Liu

clean up coordinate step hack

parent 69e771f6
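The commit drops the five `*StepHacks` template parameters from the two gridwise GEMM kernels and removes the matching step-hack arguments from every blockwise-copy call site; the threadwise transfers now derive the destination hack object internally via `f_step_hacks(dst_desc)`. A minimal before/after sketch of the call-site effect, using the `a_blockwise_copy` names from the diff below (surrounding kernel body elided):

    // before: each call threaded an explicit, caller-supplied hack object through
    a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf, a_k0_m_k1_grid_step_hacks);
    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1,
                                        a_block_slice_copy_step,
                                        a_k0_m_k1_grid_move_slice_window_step_hack);

    // after: the copy computes whatever index-calculation hints it needs itself
    a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf);
    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, a_block_slice_copy_step);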
@@ -107,11 +107,6 @@ template <index_t BlockSize,
           typename CThreadTransferSrcDstAccessOrder,
           index_t CThreadTransferSrcDstVectorDim,
           index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks,
           bool CAccessOrderMRepeatNRepeat,
           bool ABlockLdsExtraM,
           bool BBlockLdsExtraN>
@@ -502,15 +497,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
         constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
 
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_k0_m_k1_grid_step_hacks = AGridStepHacks{};
-        constexpr auto b_k0_n_k1_grid_step_hacks = BGridStepHacks{};
-
-        // hack to control index calculation when move slice window for A and B matrix for
-        // threadwise copy
-        constexpr auto a_k0_m_k1_grid_move_slice_window_step_hack = AGridMoveSliceWindowStepHacks{};
-        constexpr auto b_k0_n_k1_grid_move_slice_window_step_hack = BGridMoveSliceWindowStepHacks{};
-
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
             p_a_block, a_block_desc_k0_m_k1.GetElementSpaceSize());
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
@@ -518,8 +504,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
         // preload data into LDS
         {
-            a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf, a_k0_m_k1_grid_step_hacks);
-            b_blockwise_copy.RunRead(b_grid_desc_k0_n_k1, b_grid_buf, b_k0_n_k1_grid_step_hacks);
+            a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf);
+            b_blockwise_copy.RunRead(b_grid_desc_k0_n_k1, b_grid_buf);
 
             a_blockwise_copy.RunWrite(a_block_desc_k0_m_k1, a_block_buf);
             b_blockwise_copy.RunWrite(b_block_desc_k0_n_k1, b_block_buf);
@@ -535,20 +521,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
             do
             {
-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1,
-                                                    a_block_slice_copy_step,
-                                                    a_k0_m_k1_grid_move_slice_window_step_hack);
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n_k1,
-                                                    b_block_slice_copy_step,
-                                                    b_k0_n_k1_grid_move_slice_window_step_hack);
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, a_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n_k1, b_block_slice_copy_step);
 
-                a_blockwise_copy.RunRead(
-                    a_grid_desc_k0_m_k1, a_grid_buf, a_k0_m_k1_grid_step_hacks);
+                a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf);
 
                 block_sync_lds();
 
-                b_blockwise_copy.RunRead(
-                    b_grid_desc_k0_n_k1, b_grid_buf, b_k0_n_k1_grid_step_hacks);
+                b_blockwise_copy.RunRead(b_grid_desc_k0_n_k1, b_grid_buf);
 
                 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
@@ -597,8 +577,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
         const index_t n_thread_data_on_grid =
             n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
 
-        constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks = CGridStepHacks{};
-
         const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor =
             make_single_stage_tensor_adaptor(
                 make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
@@ -651,7 +629,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r5
             c_thread_buf,
             c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
             c_grid_buf,
-            c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks,
             c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
             c0_grid_buf,
             c1_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
...
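For context, the deleted `AGridStepHacks`-style parameters were compile-time tuples of `Sequence`s that every kernel instantiation had to spell out by hand to steer index calculation in the threadwise copies. A hypothetical sketch of the caller-side boilerplate this removes; the tuple shape and `Sequence` lengths varied per tensor layout, so the values shown here are illustrative only:

    // hypothetical instantiation-side boilerplate (shapes illustrative):
    // one inner tuple of Sequences for positive steps, one for negative steps
    using AGridStepHacks = decltype(
        make_tuple(make_tuple(Sequence<0, 0, 0>{},  // + step hints, per transform
                              Sequence<0, 0, 0>{},
                              Sequence<0, 0, 0>{}),
                   make_tuple(Sequence<0, 0, 0>{},  // - step hints, per transform
                              Sequence<0, 0, 0>{},
                              Sequence<0, 0, 0>{})));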
@@ -101,11 +101,6 @@ template <index_t BlockSize,
           typename CThreadTransferSrcDstAccessOrder,
           index_t CThreadTransferSrcDstVectorDim,
           index_t CThreadTransferDstScalarPerVector,
-          typename AGridStepHacks,
-          typename BGridStepHacks,
-          typename CGridStepHacks,
-          typename AGridMoveSliceWindowStepHacks,
-          typename BGridMoveSliceWindowStepHacks,
           bool CAccessOrderMRepeatNRepeat,
           bool ABlockLdsExtraM,
           bool BBlockLdsExtraN>
@@ -488,15 +483,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
         constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0);
 
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_k0_m_k1_grid_step_hacks = AGridStepHacks{};
-        constexpr auto b_k0_n_k1_grid_step_hacks = BGridStepHacks{};
-
-        // hack to control index calculation when move slice window for A and B matrix for
-        // threadwise copy
-        constexpr auto a_k0_m_k1_grid_move_slice_window_step_hack = AGridMoveSliceWindowStepHacks{};
-        constexpr auto b_k0_n_k1_grid_move_slice_window_step_hack = BGridMoveSliceWindowStepHacks{};
-
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
             p_a_block, a_block_desc_k0_m_k1.GetElementSpaceSize());
         auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
@@ -504,8 +490,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
         // preload data into LDS
         {
-            a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf, a_k0_m_k1_grid_step_hacks);
-            b_blockwise_copy.RunRead(b_grid_desc_k0_n_k1, b_grid_buf, b_k0_n_k1_grid_step_hacks);
+            a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf);
+            b_blockwise_copy.RunRead(b_grid_desc_k0_n_k1, b_grid_buf);
 
             a_blockwise_copy.RunWrite(a_block_desc_k0_m_k1, a_block_buf);
             b_blockwise_copy.RunWrite(b_block_desc_k0_n_k1, b_block_buf);
@@ -522,19 +508,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
             do
             {
                 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1,
-                                                    a_block_slice_copy_step,
-                                                    a_k0_m_k1_grid_move_slice_window_step_hack);
+                                                    a_block_slice_copy_step);
                 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n_k1,
-                                                    b_block_slice_copy_step,
-                                                    b_k0_n_k1_grid_move_slice_window_step_hack);
+                                                    b_block_slice_copy_step);
 
                 a_blockwise_copy.RunRead(
-                    a_grid_desc_k0_m_k1, a_grid_buf, a_k0_m_k1_grid_step_hacks);
+                    a_grid_desc_k0_m_k1, a_grid_buf);
 
                 block_sync_lds();
 
                 b_blockwise_copy.RunRead(
-                    b_grid_desc_k0_n_k1, b_grid_buf, b_k0_n_k1_grid_step_hacks);
+                    b_grid_desc_k0_n_k1, b_grid_buf);
 
                 blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
@@ -583,8 +567,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
         const index_t n_thread_data_on_grid =
             n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
 
-        constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks = CGridStepHacks{};
-
         const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor =
             make_single_stage_tensor_adaptor(
                 make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
@@ -635,7 +617,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r6
             c_thread_buf,
             c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
             c_grid_buf,
-            c_m0_n0_m1_n1_m2_m3_m4_n2_grid_tensor_step_hacks,
             c0_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
             c0_grid_buf);
     }
...
@@ -397,14 +397,12 @@ struct ThreadwiseTensorSliceTransfer_v1r4
               typename SrcBuffer,
               typename DstBuffer,
               typename Dst0Buffer,
-              typename Dst1Buffer,
-              typename DstStepHacks>
+              typename Dst1Buffer>
     __device__ void Run(const SrcDesc&,
                         const SrcSliceOriginIdx&,
                         const SrcBuffer& src_buf,
                         const DstDesc& dst_desc,
                         DstBuffer& dst_buf,
-                        const DstStepHacks& dst_step_hacks,
                         const Dst0Desc& dst0_desc,
                         const Dst0Buffer& dst0_buf,
                         const Dst1Desc& dst1_desc,
@@ -427,7 +425,7 @@ struct ThreadwiseTensorSliceTransfer_v1r4
             src_buf,
             dst_desc,
             dst_buf,
-            dst_step_hacks,
+            f_step_hacks(dst_desc),
             dst0_desc,
             dst0_buf,
             f_step_hacks(dst0_desc),
...
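In both threadwise transfers the change is the same: `dst_step_hacks` disappears from the `Run` signature, and the call into the lower-level transfer passes `f_step_hacks(dst_desc)` instead, the same helper that already generated the hack objects for the auxiliary `dst0`/`dst1` descriptors. A plausible, simplified sketch of such a helper, assuming it merely builds a neutral all-zero hack for a descriptor (the repository's actual helper may shape the tuple differently, and `make_zero_step_hacks` is a hypothetical name):

    // hypothetical sketch: a neutral (all-zero) step-hack for a descriptor with
    // NTransform transforms; zero Sequences mean "no index-calculation shortcut"
    template <typename Desc>
    __device__ static constexpr auto make_zero_step_hacks(const Desc&)
    {
        constexpr index_t ntransform = Desc::GetNumOfTransform();
        constexpr auto zeros = typename uniform_sequence_gen<ntransform, 0>::type{};
        return make_tuple(zeros, zeros); // {+ step hints, - step hints}
    }

The net effect of the commit is that step hacks become an implementation detail of the transfer classes rather than part of the kernels' public template interface, shrinking every GEMM instantiation's parameter list by five types.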
@@ -339,14 +339,12 @@ struct ThreadwiseTensorSliceTransfer_v1r5
     template <typename SrcSliceOriginIdx,
               typename SrcBuffer,
               typename DstBuffer,
-              typename Dst0Buffer,
-              typename DstStepHacks>
+              typename Dst0Buffer>
     __device__ void Run(const SrcDesc&,
                         const SrcSliceOriginIdx&,
                         const SrcBuffer& src_buf,
                         const DstDesc& dst_desc,
                         DstBuffer& dst_buf,
-                        const DstStepHacks& dst_step_hacks,
                         const Dst0Desc& dst0_desc,
                         const Dst0Buffer& dst0_buf)
     {
@@ -367,7 +365,7 @@ struct ThreadwiseTensorSliceTransfer_v1r5
             src_buf,
             dst_desc,
             dst_buf,
-            dst_step_hacks,
+            f_step_hacks(dst_desc),
             dst0_desc,
             dst0_buf,
             f_step_hacks(dst0_desc));
...