Commit 25b71afc authored by Jing Zhang

debugging: add nested loops

parent 4500596a
@@ -159,6 +159,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
             ABlockTransferSrcScalarPerVector,
             ABlockTransferDstScalarPerVector_K>{};

+        // register allocation for output
+        FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
+
+        // zero out threadwise output
+        threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_c_thread);
+
         auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

         const auto k_thread_id = c_thread_mtx_index.k;
@@ -226,12 +232,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
         FloatAB* p_a_block = p_shared_block;

-        // register allocation for output
-        FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
-
-        // zero out threadwise output
-        threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_c_thread);
-
         constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);

         // hack to control index calculation when iterating over A and B matrix for threadwise copy
@@ -514,29 +514,37 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
             d_vec,
             c_k_n_ho_wo_global_tensor_iterator_hacks);

-#if 1
-        static_for<0, d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize(), 1>{}([&](auto j) {
-            vector_type<int8_t, CThreadTransferDstScalarPerVector> t;
-
-            constexpr auto k_i  = j / (HoPerThreadx2 * WoPerThreadx2);
-            constexpr auto hw_i = j % (HoPerThreadx2 * WoPerThreadx2);
-            constexpr auto h_i  = hw_i / WoPerThreadx2;
-            constexpr auto w_i  = hw_i % WoPerThreadx2;
-
-            // t.template AsType<FloatC>()(Number<0>{}) = d_vec.template AsType<FloatC>()[j];
-            t.template AsType<FloatC>()(Number<0>{}) = d_vec[j];
-
-            static_for<0, CThreadTransferDstScalarPerVector, 1>{}([&](auto i) {
-                t.template AsType<int8_t>()(i) +=
-                    p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(make_tuple(
-                        k_i * CThreadTransferDstScalarPerVector + i, 0, h_i / 2, w_i / 2))];
-            });
-
-            // d_vec.template AsType<FloatC>()(j) = t.template AsType<FloatC>()[Number<0>{}];
-            d_vec[j] = t.template AsType<FloatC>()[Number<0>{}];
-        });
-#endif
+        static_for<0, KPerThreadAdd, 1>{}([&](auto k_i) {
+            static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) {
+                static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) {
+                    vector_type<int8_t, CThreadTransferDstScalarPerVector> t;
+
+                    // t.template AsType<FloatC>()(Number<0>{}) = d_vec.template AsType<
+                    //     FloatC>()[Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(
+                    //     make_tuple(k_i, 0, h_i, w_i))>{}];
+                    t.template AsType<FloatC>()(Number<0>{}) =
+                        d_vec[Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(
+                            make_tuple(k_i, 0, h_i, w_i))>{}];
+
+                    static_for<0, CThreadTransferDstScalarPerVector, 1>{}([&](auto i) {
+                        t.template AsType<int8_t>()(i) +=
+                            p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
+                                make_tuple(k_i * CThreadTransferDstScalarPerVector + i,
+                                           0,
+                                           h_i / 2,
+                                           w_i / 2))];
+                    });
+
+                    // d_vec.template AsType<FloatC>()(
+                    //     Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(make_tuple(
+                    //     k_i, 0, h_i, w_i))>{}) = t.template AsType<FloatC>()[Number<0>{}];
+                    d_vec[Number<d_k_n_hox2_wox2_thread_desc.CalculateOffset(make_tuple(
+                        k_i, 0, h_i, w_i))>{}] = t.template AsType<FloatC>()[Number<0>{}];
+                });
+            });
+        });

         ThreadwiseDynamicTensorSliceTransfer_v1r3<
             // decltype(d_vec),
...
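Note: the rewrite in the hunk above replaces the single flat static_for over the packed thread-descriptor index j with three nested static_for loops over (k_i, h_i, w_i), so the offsets into d_vec and p_c_thread come from CalculateOffset instead of manual division and modulo on j. Below is a minimal host-side sketch of that index equivalence, assuming a contiguous (packed, row-major) K x 1 x Ho x Wo layout; the extents are made-up stand-ins for KPerThreadAdd, HoPerThreadx2 and WoPerThreadx2, not the kernel's real template parameters.

// Sketch only: verify the old flat-index decomposition and the new nested
// (k_i, h_i, w_i) loops visit the same packed offsets, in the same order.
#include <cassert>
#include <cstddef>

int main()
{
    // stand-in extents (not the kernel's KPerThreadAdd / HoPerThreadx2 / WoPerThreadx2)
    constexpr std::size_t K  = 2;
    constexpr std::size_t Ho = 4;
    constexpr std::size_t Wo = 4;

    std::size_t j = 0; // the old flat loop index
    for(std::size_t k_i = 0; k_i < K; ++k_i)
        for(std::size_t h_i = 0; h_i < Ho; ++h_i)
            for(std::size_t w_i = 0; w_i < Wo; ++w_i)
            {
                // what CalculateOffset(make_tuple(k_i, 0, h_i, w_i)) returns
                // for a contiguous, packed descriptor
                const std::size_t offset = (k_i * Ho + h_i) * Wo + w_i;

                // the old code recovered the same coordinates from j
                assert(k_i == j / (Ho * Wo));
                assert(h_i == (j % (Ho * Wo)) / Wo);
                assert(w_i == j % Wo);
                assert(offset == j);

                ++j;
            }
    return 0;
}

One behavioural difference worth noting: the old loop's trip count came from d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize(), while the new outer loop runs over KPerThreadAdd, so the two iterations only cover the same range if KPerThreadAdd * HoPerThreadx2 * WoPerThreadx2 equals that element space size.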
@@ -265,7 +265,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     }

     template <typename SrcSliceOriginIdx, typename DstIteratorHacks>
-    __device__ void Run2(const SrcDesc&,
+    __device__ void Run(const SrcDesc&,
                         const SrcSliceOriginIdx&,
                         const SrcData& p_src,
                         const DstDesc& dst_desc,
@@ -785,7 +785,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
     }

     template <typename DstSliceOriginIdx, typename SrcIteratorHacks>
-    __device__ void Run2(const SrcDesc& src_desc,
+    __device__ void Run(const SrcDesc& src_desc,
                         const SrcData* p_src,
                         const DstDesc&,
                         const DstSliceOriginIdx&,
...
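Note: the last two hunks rename the Run2 overloads of ThreadwiseDynamicTensorSliceTransfer_v1r3 and ThreadwiseDynamicTensorSliceTransfer_v2 to Run, so the hack-aware variant is presumably now selected by overload resolution on its extra slice-origin / iterator-hacks parameters rather than by a separate name. A toy sketch of that pattern follows; Desc, Hacks and Transfer are made-up stand-ins, not the library's classes.

// Sketch only: two Run overloads on one name, distinguished by a trailing
// "iterator hacks" parameter, mirroring the shape the Run2 -> Run rename moves toward.
#include <cstdio>

struct Desc  {};
struct Hacks {};

struct Transfer
{
    // plain overload
    void Run(const Desc&, const float* p_src, float* p_dst)
    {
        std::printf("plain Run\n");
        p_dst[0] = p_src[0];
    }

    // hack-aware overload (previously a separate Run2 name)
    template <typename IteratorHacks>
    void Run(const Desc&, const float* p_src, float* p_dst, const IteratorHacks&)
    {
        std::printf("hack-aware Run\n");
        p_dst[0] = p_src[0];
    }
};

int main()
{
    Transfer t;
    Desc desc;
    float src = 1.0f, dst = 0.0f;

    t.Run(desc, &src, &dst);          // resolves to the plain overload
    t.Run(desc, &src, &dst, Hacks{}); // resolves to the former "Run2"
    return 0;
}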