"...composable_kernel.git" did not exist on "4daedf8ca56f3bd93481708bd9d762045839ec20"
Commit 4500596a authored by Jing Zhang's avatar Jing Zhang
Browse files

debugging with array

parent a476b4ba
...@@ -246,9 +246,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -246,9 +246,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
BGlobalMoveSliceWindowIteratorHacks{}; BGlobalMoveSliceWindowIteratorHacks{};
constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize(); constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();
FloatAB p_b_thread[b_thread_space_size * 2]; FloatAB p_b_thread_double[b_thread_space_size * 2];
FloatAB* p_b_thread_double = p_b_thread;
// LDS double buffer: preload data into LDS // LDS double buffer: preload data into LDS
{ {
...@@ -480,12 +478,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -480,12 +478,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
Number<HoPerThreadx2>{}, Number<HoPerThreadx2>{},
Number<WoPerThreadx2>{})); Number<WoPerThreadx2>{}));
constexpr auto vec_len = d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize() *
CThreadTransferDstScalarPerVector;
static_assert(vec_len == 256, "");
// vector_type<int8_t, vec_len> d_vec;
FloatC d_vec[d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize()]; FloatC d_vec[d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize()];
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{}; constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
ThreadwiseDynamicTensorSliceTransfer_v2< ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC, FloatC,
// decltype(d_vec),
FloatC, FloatC,
decltype(d_k_n_hox2_wox2_global_desc), decltype(d_k_n_hox2_wox2_global_desc),
decltype(d_k_n_hox2_wox2_thread_desc), decltype(d_k_n_hox2_wox2_thread_desc),
...@@ -510,34 +514,32 @@ struct GridwiseDynamicGemm_km_kn_mn_v3 ...@@ -510,34 +514,32 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
d_vec, d_vec,
c_k_n_ho_wo_global_tensor_iterator_hacks); c_k_n_ho_wo_global_tensor_iterator_hacks);
for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i) #if 1
{
for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
{
for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
{
vector_type<int8_t, CThreadTransferDstScalarPerVector> t;
t.template AsType<FloatC>()(Number<0>{}) = static_for<0, d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize(), 1>{}([&](auto j) {
d_vec[d_k_n_hox2_wox2_thread_desc.CalculateOffset( vector_type<int8_t, CThreadTransferDstScalarPerVector> t;
make_tuple(k_i, 0, h_i, w_i))];
static_for<0, CThreadTransferDstScalarPerVector, 1>{}([&](auto i) { constexpr auto k_i = j / (HoPerThreadx2 * WoPerThreadx2);
t.template AsType<int8_t>()(i) += constexpr auto hw_i = j % (HoPerThreadx2 * WoPerThreadx2);
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset( constexpr auto h_i = hw_i / WoPerThreadx2;
make_tuple(k_i * CThreadTransferDstScalarPerVector + i, constexpr auto w_i = hw_i % WoPerThreadx2;
0,
h_i / 2,
w_i / 2))];
});
d_vec[d_k_n_hox2_wox2_thread_desc.CalculateOffset(make_tuple( // t.template AsType<FloatC>()(Number<0>{}) = d_vec.template AsType<FloatC>()[j];
k_i, 0, h_i, w_i))] = t.template AsType<FloatC>()[Number<0>{}]; t.template AsType<FloatC>()(Number<0>{}) = d_vec[j];
}
} static_for<0, CThreadTransferDstScalarPerVector, 1>{}([&](auto i) {
} t.template AsType<int8_t>()(i) +=
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(make_tuple(
k_i * CThreadTransferDstScalarPerVector + i, 0, h_i / 2, w_i / 2))];
});
// d_vec.template AsType<FloatC>()(j) = t.template AsType<FloatC>()[Number<0>{}];
d_vec[j] = t.template AsType<FloatC>()[Number<0>{}];
});
#endif
ThreadwiseDynamicTensorSliceTransfer_v1r3< ThreadwiseDynamicTensorSliceTransfer_v1r3<
// decltype(d_vec),
FloatC, FloatC,
FloatC, FloatC,
decltype(d_k_n_hox2_wox2_thread_desc), decltype(d_k_n_hox2_wox2_thread_desc),
......
...@@ -416,7 +416,7 @@ struct vector_type<T, 256> ...@@ -416,7 +416,7 @@ struct vector_type<T, 256>
{ {
d256_t d256_; d256_t d256_;
StaticallyIndexedArray<d1_t, 256> d1x256_; StaticallyIndexedArray<d1_t, 256> d1x256_;
StaticallyIndexedArray<d16_t, 1> d16x16_; StaticallyIndexedArray<d16_t, 16> d16x16_;
StaticallyIndexedArray<d256_t, 1> d16x1_; StaticallyIndexedArray<d256_t, 1> d16x1_;
} data_; } data_;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment