Commit 2049ab57 authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed output

parent a3edbec9
...@@ -587,7 +587,6 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2 ...@@ -587,7 +587,6 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
constexpr index_t BlkSize = blockwise_gemm.GetBlkSize(); constexpr index_t BlkSize = blockwise_gemm.GetBlkSize();
constexpr index_t NumBlks = blockwise_gemm.GetNumBlks(); constexpr index_t NumBlks = blockwise_gemm.GetNumBlks();
#if 1
// force unrolling the output loop to get ride of scratches // force unrolling the output loop to get ride of scratches
static_for<0, NumBlks, 1>{}([&](auto blk_id) { static_for<0, NumBlks, 1>{}([&](auto blk_id) {
// calculate origin of thread output tensor on global memory // calculate origin of thread output tensor on global memory
...@@ -618,40 +617,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2 ...@@ -618,40 +617,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
m_thread_data_on_global % (M2 * M1) / M2, m_thread_data_on_global % (M2 * M1) / M2,
m_thread_data_on_global % M2, m_thread_data_on_global % M2,
n_thread_data_on_global)) n_thread_data_on_global))
.Store(c_thread_vec, p_c_global); .Store(c_thread_vec.GetVector(Number<BlkSize>{})[Number<blk_id>{}], p_c_global);
}); });
#else
for(index_t i = 0; i < NumBlks; ++i)
{
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i);
const index_t m_thread_data_on_global =
m_block_data_on_global + c_thread_mtx_on_block.row;
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(c_g_m0_m1_m2_n_thread_desc),
decltype(c_g_m0_m1_m2_n_global_desc),
CThreadCopySliceLengths,
arithmetic_sequence_gen<0, 5, 1>::type,
4,
1,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryOp>(
make_multi_index(0, 0, 0, 0, 0),
make_multi_index(g_block_data_on_global,
m_thread_data_on_global / (M2 * M1),
m_thread_data_on_global % (M2 * M1) / M2,
m_thread_data_on_global % M2,
n_thread_data_on_global))
.Run(c_thread_vec.n + i * BlkSize, p_c_global);
}
#endif
} }
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment