debugging

0924d5e5 · Jing Zhang · 5e127c69 · 0924d5e5
Commit 0924d5e5 authored Apr 14, 2021 by Jing Zhang
Show whitespace changes
Inline Side-by-side

Showing with 36 additions and 34 deletions

composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp +36 -34

No files found.
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -355,7 +355,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
 #endif

        // output: register to global memory
-#if 1
+#if 0
        {
            constexpr auto HoPerThreadx2 = HoPerThread * 2;
            constexpr auto WoPerThreadx2 = WoPerThread * 2;
@@ -480,12 +480,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
                                                                          Number<HoPerThreadx2>{},
                                                                          Number<WoPerThreadx2>{}));

-            constexpr auto vector_len = KPerThread * HoPerThreadx2 * WoPerThreadx2;
+            constexpr auto vector_len = d_k_n_hox2_wox2_thread_desc.GetElementSpaceSize() *
+                                        CThreadTransferDstScalarPerVector;

-            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
            vector_type<int8_t, vector_len> d_vec;

-            auto d_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
+            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
+
+            ThreadwiseDynamicTensorSliceTransfer_v2<
                FloatC,
                decltype(d_vec),
                decltype(d_k_n_hox2_wox2_global_desc),
@@ -503,35 +505,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
                      make_multi_index(k_thread_data_on_global_add,
                                       0,
                                       hox2_thread_data_on_global,
-                                       wox2_thread_data_on_global));
-
-            auto c_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v1r3<
-                decltype(d_vec),
-                FloatC,
-                decltype(d_k_n_hox2_wox2_thread_desc),
-                decltype(d_k_n_hox2_wox2_global_desc),
-                Sequence<KPerThreadAdd, 1, HoPerThreadx2, WoPerThreadx2>,
-                CThreadTransferSrcDstAccessOrder,
-                CThreadTransferSrcDstVectorDim,
-                // CThreadTransferDstScalarPerVector,
-                1,
-                AddressSpace::Vgpr,
-                AddressSpace::Global,
-                CGlobalMemoryDataOperation,
-                1,
-                true>(d_k_n_hox2_wox2_global_desc,
-                      make_multi_index(k_thread_data_on_global_add,
-                                       0,
-                                       hox2_thread_data_on_global,
-                                       wox2_thread_data_on_global));
-
-            d_threadwise_transfer.Run2(d_k_n_hox2_wox2_global_desc,
+                                       wox2_thread_data_on_global))
+                .Run2(d_k_n_hox2_wox2_global_desc,
                      p_d_global,
                      d_k_n_hox2_wox2_thread_desc,
                      make_tuple(I0, I0, I0, I0),
                      d_vec,
                      c_k_n_ho_wo_global_tensor_iterator_hacks);

+            static_assert(vector_len == 256, "");
+
            static_for<0, vector_len, 1>{}([&](auto i) {
                constexpr auto kpack_i = i % (CThreadTransferDstScalarPerVector);
                constexpr auto khw_i   = i / (CThreadTransferDstScalarPerVector);
@@ -545,7 +528,26 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
                        k_i * CThreadTransferDstScalarPerVector + kpack_i, 0, h_i / 2, w_i / 2))];
            });

-            c_threadwise_transfer.Run2(d_k_n_hox2_wox2_thread_desc,
+            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                decltype(d_vec),
+                FloatC,
+                decltype(d_k_n_hox2_wox2_thread_desc),
+                decltype(d_k_n_hox2_wox2_global_desc),
+                Sequence<KPerThreadAdd, 1, HoPerThreadx2, WoPerThreadx2>,
+                CThreadTransferSrcDstAccessOrder,
+                CThreadTransferSrcDstVectorDim,
+                // CThreadTransferDstScalarPerVector,
+                1,
+                AddressSpace::Vgpr,
+                AddressSpace::Global,
+                CGlobalMemoryDataOperation,
+                1,
+                true>(d_k_n_hox2_wox2_global_desc,
+                      make_multi_index(k_thread_data_on_global_add,
+                                       0,
+                                       hox2_thread_data_on_global,
+                                       wox2_thread_data_on_global))
+                .Run2(d_k_n_hox2_wox2_thread_desc,
                      make_tuple(I0, I0, I0, I0),
                      d_vec,
                      d_k_n_hox2_wox2_global_desc,