refactor

e131f6aa · Chao Liu · f64fab12 · e131f6aa · e131f6aa · e131f6aa
Commit e131f6aa authored Apr 18, 2020 by Chao Liu
3 changed files
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -164,7 +164,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
        constexpr index_t KBlockWork = K / KPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

-#if 0
        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});

@@ -172,15 +171,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer

        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
-#else
-        constexpr auto block_work_desc =
-            make_cluster_descriptor(Sequence<BBlockWork, KBlockWork>{});
-
-        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
-
-        const index_t b_block_data_on_global = block_work_id[0] * BPerBlock;
-        const index_t k_block_data_on_global = block_work_id[1] * KPerBlock;
-#endif

        // input tensor
        //     global tensor in global memory

--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -56,11 +56,6 @@ struct BlockwiseGenericTensorSliceCopy_v4
        constexpr auto thread_cluster_desc =
            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

-#if 0
-        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
-                      "wrong! BlockSize not consistent with ThreadClusterLengths");
-#endif
-
        const auto thread_cluster_id =
            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());

@@ -88,7 +83,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
        constexpr auto thread_cluster_desc =
            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

-        if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
+        if(BlockSize == thread_cluster_desc.GetElementSize())
+        {
+            // TODO: threadwise copy is still being tweaked
+            if(has_optimized_address_calculation)
+            {
+                mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer);
+            }
+            else
+            {
+                mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
+            }
+        }
+        else if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
        {
            // TODO: threadwise copy is still being tweaked
            if(has_optimized_address_calculation)
@@ -112,7 +119,20 @@ struct BlockwiseGenericTensorSliceCopy_v4
        constexpr auto thread_cluster_desc =
            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

-        if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
+        if(BlockSize == thread_cluster_desc.GetElementSize())
+        {
+            // TODO: threadwise copy is still being tweaked
+            if(has_optimized_address_calculation)
+            {
+                mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
+                                                                       p_block_dst);
+            }
+            else
+            {
+                mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
+            }
+        }
+        else if(get_thread_local_1d_id() < thread_cluster_desc.GetElementSize())
        {
            // TODO: threadwise copy is still being tweaked
            if(has_optimized_address_calculation)

--- a/composable_kernel/include/tensor_operation/gridwise_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm.hpp
@@ -111,7 +111,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
        constexpr index_t MBlockWork = M / MPerBlock;
        constexpr index_t NBlockWork = N / NPerBlock;

-#if 1
        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});

@@ -119,15 +118,6 @@ struct GridwiseGemmTransposedANormalBNormalC_v1

        const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
        const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
-#else
-        constexpr auto block_work_desc =
-            make_cluster_descriptor(Sequence<NBlockWork, MBlockWork>{});
-
-        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
-
-        const index_t n_block_data_on_global = block_work_id[0] * NPerBlock;
-        const index_t m_block_data_on_global = block_work_id[1] * MPerBlock;
-#endif

        // A matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment