Commit a476b4ba authored by Jing Zhang

debugging with array

parent 0924d5e5
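
The static_for over vector_len in the diff below splits a flat int8 lane index into a packed lane plus a (k, h, w) position inside the thread tile, where each FloatC element packs CThreadTransferDstScalarPerVector int8 values. A minimal standalone sketch of that index arithmetic follows; the tile sizes here (KPerThreadAdd = 1, HoPerThreadx2 = WoPerThreadx2 = 4, 16-wide packing) are illustrative assumptions, not values taken from this kernel's configuration.

// Standalone sketch of the flat-index decomposition used by the
// static_for<0, vector_len, 1> loop in the diff below (illustrative sizes only).
#include <cstdio>

int main()
{
    constexpr int CThreadTransferDstScalarPerVector = 16; // int8 lanes per FloatC (assumed)
    constexpr int KPerThreadAdd                     = 1;  // assumed
    constexpr int HoPerThreadx2                     = 4;  // assumed
    constexpr int WoPerThreadx2                     = 4;  // assumed
    constexpr int vector_len = KPerThreadAdd * HoPerThreadx2 * WoPerThreadx2 *
                               CThreadTransferDstScalarPerVector;

    for(int i = 0; i < vector_len; ++i)
    {
        const int kpack_i = i % CThreadTransferDstScalarPerVector; // lane inside one FloatC
        const int khw_i   = i / CThreadTransferDstScalarPerVector; // FloatC element index
        const int k_i     = khw_i / (HoPerThreadx2 * WoPerThreadx2);
        const int hw_i    = khw_i % (HoPerThreadx2 * WoPerThreadx2);
        const int h_i     = hw_i / WoPerThreadx2;
        const int w_i     = hw_i % WoPerThreadx2;

        if(i < 4 || i == vector_len - 1)
            printf("i=%3d -> k=%d kpack=%2d h=%d w=%d\n", i, k_i, kpack_i, h_i, w_i);
    }
    return 0;
}
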
@@ -466,7 +466,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
wox2_block_data_on_global + wo_thread_id * WoPerThreadx2;
static_assert(KPerThread % CThreadTransferDstScalarPerVector == 0, "");
// static_assert(CThreadTransferDstScalarPerVector == 16, "");
static_assert(CThreadTransferDstScalarPerVector == 16, "");
constexpr auto KPerThreadAdd = KPerThread / CThreadTransferDstScalarPerVector;
const index_t k_block_data_on_global_add =
@@ -480,16 +480,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
Number<HoPerThreadx2>{},
Number<WoPerThreadx2>{}));
constexpr auto vector_len = d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize() *
CThreadTransferDstScalarPerVector;
vector_type<int8_t, vector_len> d_vec;
FloatC d_vec[d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize()];
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC,
decltype(d_vec),
FloatC,
decltype(d_k_n_hox2_wox2_global_desc),
decltype(d_k_n_hox2_wox2_thread_desc),
Sequence<KPerThreadAdd, 1, HoPerThreadx2, WoPerThreadx2>,
@@ -506,30 +503,42 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
0,
hox2_thread_data_on_global,
wox2_thread_data_on_global))
.Run2(d_k_n_hox2_wox2_global_desc,
.Run(d_k_n_hox2_wox2_global_desc,
p_d_global,
d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
c_k_n_ho_wo_global_tensor_iterator_hacks);
static_assert(vector_len == 256, "");
for(index_t k_i = 0; k_i < KPerThreadAdd; ++k_i)
{
for(index_t h_i = 0; h_i < HoPerThreadx2; ++h_i)
{
for(index_t w_i = 0; w_i < WoPerThreadx2; ++w_i)
{
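// t holds one FloatC element of d_vec, reinterpreted as
// CThreadTransferDstScalarPerVector packed int8 accumulator lanes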
vector_type<int8_t, CThreadTransferDstScalarPerVector> t;
t.template AsType<FloatC>()(Number<0>{}) =
d_vec[d_k_n_hox2_wox2_thread_desc.CalculateOffset(
make_tuple(k_i, 0, h_i, w_i))];
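// decompose the flat int8 lane index i of d_vec into a packed lane (kpack_i)
// and a (k, h, w) position in the thread tile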
static_for<0, vector_len, 1>{}([&](auto i) {
constexpr auto kpack_i = i % (CThreadTransferDstScalarPerVector);
constexpr auto khw_i = i / (CThreadTransferDstScalarPerVector);
constexpr auto k_i = khw_i / (HoPerThreadx2 * WoPerThreadx2);
constexpr auto hw_i = khw_i % (HoPerThreadx2 * WoPerThreadx2);
constexpr auto h_i = hw_i / WoPerThreadx2;
constexpr auto w_i = hw_i % WoPerThreadx2;
d_vec.template AsType<int8_t>()(i) =
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(make_tuple(
k_i * CThreadTransferDstScalarPerVector + kpack_i, 0, h_i / 2, w_i / 2))];
static_for<0, CThreadTransferDstScalarPerVector, 1>{}([&](auto i) {
t.template AsType<int8_t>()(i) +=
p_c_thread[c_k_n_ho_wo_thread_desc.CalculateOffset(
make_tuple(k_i * CThreadTransferDstScalarPerVector + i,
0,
h_i / 2,
w_i / 2))];
});
d_vec[d_k_n_hox2_wox2_thread_desc.CalculateOffset(make_tuple(
k_i, 0, h_i, w_i))] = t.template AsType<FloatC>()[Number<0>{}];
}
}
}
ThreadwiseDynamicTensorSliceTransfer_v1r3<
decltype(d_vec),
FloatC,
FloatC,
decltype(d_k_n_hox2_wox2_thread_desc),
decltype(d_k_n_hox2_wox2_global_desc),
@@ -547,7 +556,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
0,
hox2_thread_data_on_global,
wox2_thread_data_on_global))
.Run2(d_k_n_hox2_wox2_thread_desc,
.Run(d_k_n_hox2_wox2_thread_desc,
make_tuple(I0, I0, I0, I0),
d_vec,
d_k_n_hox2_wox2_global_desc,
......
@@ -377,8 +377,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
src_desc.CalculateOffset(to_multi_index(src_slice_origin_idx) + dst_data_idx +
i * dst_scalar_step_in_vector);
dst_vector.template AsType<DstData>()(i) =
type_convert<DstData>{}(p_src.template AsType<DstData>()[i]);
dst_vector.template AsType<DstData>()(i) = p_src.template AsType<DstData>()[i];
});
const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
......