Commit 7da654a4 authored by Chao Liu

added dynamic col2im

parent 9d000309
@@ -119,6 +119,7 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
         // blockwise atomic accumulation
         auto blockwise_copy =
+#if 0
             BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                    float,
                                                    float,
@@ -129,8 +130,6 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    BlockCopyClusterLengths_GemmK_GemmN,
                                                    BlockCopyThreadClusterArrangeOrder,
                                                    BlockCopySrcAccessOrder,
-                                                   BlockCopyDstAccessOrder,
-                                                   1,
                                                    1,
                                                    BlockCopyDataPerAccess_GemmN,
                                                    BlockCopyDataPerAccess_GemmN,
@@ -139,32 +138,39 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    InMemoryDataOperation::AtomicAdd,
                                                    1,
                                                    1>(
+#else
+            BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
+                                                   float,
+                                                   float,
+                                                   decltype(col_gemmk_gemmn_global_desc),
+                                                   decltype(img_gemmk_gemmn_global_desc),
+                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
+                                                   BlockCopySubLengths_GemmK_GemmN,
+                                                   BlockCopyClusterLengths_GemmK_GemmN,
+                                                   BlockCopyThreadClusterArrangeOrder,
+                                                   BlockCopySrcAccessOrder,
+                                                   BlockCopyDstAccessOrder,
+                                                   1,
+                                                   1,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   AddressSpace::Global,
+                                                   AddressSpace::Global,
+                                                   InMemoryDataOperation::AtomicAdd,
+                                                   1,
+                                                   1>(
+#endif
                 col_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global),
                 img_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global));
 
-        auto col_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate(col_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-        auto img_gemmk_gemmn_coord =
-            make_dynamic_tensor_coordinate(img_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-
-        const auto col_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            col_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-        const auto img_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            img_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-
-        for(index_t gemmk = 0; gemmk < GemmK - GemmKPerBlock; gemmk += GemmKPerBlock)
+        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
         {
             blockwise_copy.Run(p_col_global, p_img_global);
 
-            move_dynamic_tensor_coordinate(
-                col_gemmk_gemmn_global_desc, col_gemmk_gemmn_coord, col_gemmk_gemmn_coord_step);
-            move_dynamic_tensor_coordinate(
-                img_gemmk_gemmn_global_desc, img_gemmk_gemmn_coord, img_gemmk_gemmn_coord_step);
+            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
+            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
         }
     }
 };
......
@@ -459,7 +459,7 @@ struct DynamicMerge
                 // this should be saved in SGPR as well
                 index_t idx_low_length_minus_idx_diff_low_const =
                     low_lengths_[i] - idx_diff_low_const[i];
-#if 0
+#if 1
                 index_t idx_low_length_plus_idx_diff_low_const =
                     low_lengths_[i] + idx_diff_low_const[i];
 #endif
@@ -467,21 +467,20 @@ struct DynamicMerge
                 index_t idx_low_tmp = idx_low_old[i] + carry;
 
                 bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
-#if 0
+#if 1
                 bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
 #endif
 
                 idx_diff_low(i) =
                     do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
-#if 0
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
+#if 1
+                idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
 #endif
 
                 idx_diff_low(i) += carry;
 
                 carry = do_carry ? 1 : 0;
-#if 0
+#if 1
                 carry = do_borrow ? -1 : carry;
 #endif
             });
......
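Aside: the #if 0 to #if 1 flips above enable the borrow path in DynamicMerge's index update, so a negative step through a merged dimension can propagate toward the slower dimensions just as a positive step carries. Below is a minimal stand-alone C++ sketch of that carry/borrow arithmetic, written in the obvious branching form for clarity. It is an illustration only; the function name, loop direction, and data types are invented here and are not the CK implementation, which does the per-dimension update branchlessly with precomputed constants.

#include <array>
#include <cassert>
#include <cstdio>

// Update a mixed-radix ("merged") index by a per-dimension step, propagating
// carry (step crosses the upper bound) and borrow (negative step crosses zero)
// from the fastest dimension to the slowest.
template <std::size_t N>
std::array<int, N> move_merged_index(std::array<int, N> idx,
                                     const std::array<int, N>& lengths,
                                     const std::array<int, N>& diff)
{
    int carry = 0; // +1 for carry, -1 for borrow
    for(int i = static_cast<int>(N) - 1; i >= 0; --i)
    {
        int v  = idx[i] + diff[i] + carry;
        carry  = 0;
        if(v >= lengths[i]) { v -= lengths[i]; carry = 1; }  // carry into next-slower dim
        else if(v < 0)      { v += lengths[i]; carry = -1; } // borrow from next-slower dim
        idx[i] = v;
    }
    assert(carry == 0 && "step moved outside the merged dimension");
    return idx;
}

int main()
{
    // lengths {4, 3}: the merged index runs over 12 elements
    auto idx = move_merged_index<2>({1, 2}, {4, 3}, {0, 2});  // crosses the dim-1 bound -> carry
    std::printf("%d %d\n", idx[0], idx[1]);                   // prints "2 1"
    idx = move_merged_index<2>({2, 0}, {4, 3}, {0, -1});      // negative step -> borrow
    std::printf("%d %d\n", idx[0], idx[1]);                   // prints "1 2"
    return 0;
}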
@@ -9,6 +9,116 @@
 namespace ck {
template <index_t BlockSize,
typename BlockSrcData,
typename BlockDstData,
typename BlockSrcDesc,
typename BlockDstDesc,
typename BlockSliceLengths,
typename ThreadSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDstDimAccessOrder,
index_t SrcDstVectoReadDim,
index_t SrcDataPerRead,
index_t DstDataPerWrite,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t SrcDataStride,
index_t DstDataStride>
struct BlockwiseDynamicTensorSliceTransfer_v1
{
static constexpr index_t nDim =
remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
const Index& src_block_slice_origin,
const BlockDstDesc& block_dst_desc,
const Index& dst_block_slice_origin)
: threadwise_transfer_(block_src_desc,
make_zero_multi_index<nDim>(),
block_dst_desc,
make_zero_multi_index<nDim>())
{
static_assert(
nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
nDim == ThreadClusterLengths::Size() && nDim == ThreadClusterArrangeOrder::Size() &&
nDim == SrcDstDimAccessOrder::Size(),
"wrong! nDim not consistent");
static_assert(
is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
"wrong! threads should be mapped to cover entire slicing window");
static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
"wrong! BlockSize too small");
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
const auto thread_cluster_id =
thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
threadwise_transfer_.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
threadwise_transfer_.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
}
__device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.Run(p_block_src, p_block_dst);
}
}
__device__ void MoveSrcSliceWindow(const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveSrcSliceWindow(step);
}
}
__device__ void MoveDstSliceWindow(const Index& step)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.MoveDstSliceWindow(step);
}
}
private:
static constexpr auto thread_cluster_desc_ =
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
using ThreadwiseTransfer = ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
BlockDstDesc,
ThreadSliceLengths,
SrcDstDimAccessOrder,
SrcDstVectoReadDim,
SrcDataPerRead,
DstDataPerWrite,
SrcAddressSpace,
DstAddressSpace,
DstInMemOp,
SrcDataStride,
DstDataStride>;
ThreadwiseTransfer threadwise_transfer_;
};
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -20,7 +130,7 @@ template <index_t BlockSize,
           typename ThreadClusterArrangeOrder,
           typename SrcDimAccessOrder,
           typename DstDimAccessOrder,
-          index_t SrcVectoReadDim,
+          index_t SrcVectorReadDim,
           index_t DstVectorWriteDim,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite,
@@ -29,17 +139,26 @@ template <index_t BlockSize,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
           index_t DstDataStride>
-struct BlockwiseDynamicTensorSliceTransfer_v1
+struct BlockwiseDynamicTensorSliceTransfer_v2
 {
     static constexpr index_t nDim =
         remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
 
     using Index = MultiIndex<nDim>;
 
-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2(const BlockSrcDesc& block_src_desc,
                                                                 const Index& src_block_slice_origin,
                                                                 const BlockDstDesc& block_dst_desc,
                                                                 const Index& dst_block_slice_origin)
+        : threadwise_read_(block_src_desc,
+                           make_zero_multi_index<nDim>(),
+                           thread_buffer_desc_,
+                           make_zero_multi_index<nDim>()),
+          threadwise_write_(thread_buffer_desc_,
+                            make_zero_multi_index<nDim>(),
+                            block_dst_desc,
+                            make_zero_multi_index<nDim>())
     {
         static_assert(
             nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
@@ -131,7 +250,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
                                           decltype(thread_buffer_desc_),
                                           ThreadSliceLengths,
                                           SrcDimAccessOrder,
-                                          SrcVectoReadDim,
+                                          SrcVectorReadDim,
                                           SrcDataPerRead,
                                           1,
                                           SrcAddressSpace,
......
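For reference, this is how these blockwise transfers divide work: the block-wide slice is tiled by ThreadSliceLengths into a ThreadClusterLengths grid of tiles, the linear thread id is decoded into a cluster index, and each thread's slice origin is cluster_index * ThreadSliceLengths, which is what the constructor above computes before calling SetSrcSliceOrigin/SetDstSliceOrigin. A small host-side sketch of that mapping follows; the sizes mirror the new col2im configuration (Sequence<1, 8> sub-lengths, Sequence<8, 16> cluster lengths, so 128 working threads out of a 256-thread block, which is why the constructor, Run, and the move methods guard on get_thread_local_1d_id()). A plain row-major decomposition of the thread id is assumed here; the actual arrangement is governed by ThreadClusterArrangeOrder.

#include <array>
#include <cstdio>

int main()
{
    const std::array<int, 2> thread_slice_lengths   = {1, 8};  // per-thread tile
    const std::array<int, 2> thread_cluster_lengths = {8, 16}; // tiles per block
    // block slice = {8, 128}, served by 8 * 16 = 128 threads

    const int working_threads = thread_cluster_lengths[0] * thread_cluster_lengths[1];
    for(int tid = 0; tid < working_threads; ++tid)
    {
        // row-major decomposition of the linear thread id into a cluster index
        const std::array<int, 2> cluster_idx = {tid / thread_cluster_lengths[1],
                                                tid % thread_cluster_lengths[1]};
        // each thread starts its copy at cluster_idx * thread_slice_lengths
        const std::array<int, 2> data_origin = {cluster_idx[0] * thread_slice_lengths[0],
                                                cluster_idx[1] * thread_slice_lengths[1]};
        if(tid < 3)
            std::printf("thread %d copies the [%d..%d) x [%d..%d) sub-slice\n",
                        tid,
                        data_origin[0], data_origin[0] + thread_slice_lengths[0],
                        data_origin[1], data_origin[1] + thread_slice_lengths[1]);
    }
    return 0;
}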
#ifndef CK_GRIDWISE_TENSOR_CONTRACTION_HPP
#define CK_GRIDWISE_TENSOR_CONTRACTION_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
namespace ck {
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename AccFloat,
typename AGlobalDesc,
typename BGlobalDesc,
typename CGlobalDesc,
typename CBlockLengths,
index_t KPerBlock,
InMemoryDataOperation CGlobalMemoryDataOperation>
struct GridwiseTensorContraction_v1
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() {}
__device__ void Run(const Float* __restrict__ p_a_global,
const Float* __restrict__ p_b_global,
Float* __restrict__ p_c_global,
Float* __restrict__ p_shared_block) const
{
/// \todo sanity-check on AGlobalDesc, BGlobalDesc, CGlobalDesc length consistency
/// \todo sanity-check on CBlockLengths
constexpr auto True = integral_constant<bool, true>{};
constexpr auto a_global_desc = AGlobalDesc{};
constexpr auto b_global_desc = BGlobalDesc{};
constexpr auto c_global_desc = CGlobalDesc{};
constexpr auto K = a_global_desc.GetLengths()[0];
// don't do anything if K == 0
if(K == 0)
{
return;
}
// lds max alignment
constexpr index_t max_lds_align = math::lcm(ABlockCopyDstDataPerWrite_M,
BBlockCopyDstDataPerWrite_N,
ThreadGemmAThreadCopySrcDataPerRead_M,
ThreadGemmBThreadCopySrcDataPerRead_N);
// divide block work by [M, N]
static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t MBlockWork = M / MPerBlock;
constexpr index_t NBlockWork = N / NPerBlock;
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned(
Sequence<KPerBlock, MPerBlock>{}, Number<max_lds_align>{});
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
AGlobalDesc,
decltype(a_block_desc),
decltype(a_k_m_block_desc.GetLengths()),
ABlockCopyThreadSliceLengths_K_M,
ABlockCopyThreadClusterLengths_K_M,
ABlockCopyThreadClusterArrangeOrder,
ABlockCopySrcAccessOrder,
Sequence<0, 1>,
ABlockCopySrcVectorReadDim,
1,
ABlockCopySrcDataPerRead,
ABlockCopyDstDataPerWrite_M,
AddressSpace::Global,
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
{0, m_block_data_on_global}, {0, 0});
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned(
Sequence<KPerBlock, NPerBlock>{}, Number<max_lds_align>{});
// B matrix blockwise copy
auto b_blockwise_copy =
BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(b_k_n_global_desc),
decltype(b_k_n_block_desc),
decltype(b_k_n_block_desc.GetLengths()),
BBlockCopyThreadSliceLengths_K_N,
BBlockCopyThreadClusterLengths_K_N,
BBlockCopyThreadClusterArrangeOrder,
BBlockCopySrcAccessOrder,
Sequence<0, 1>,
BBlockCopySrcVectorReadDim,
1,
BBlockCopySrcDataPerRead,
BBlockCopyDstDataPerWrite_N,
AddressSpace::Global,
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
{0, n_block_data_on_global}, {0, 0});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[KPerBlock, MPerBlock] is in LDS
// b_mtx[KPerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc);
constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc);
// sanity check
static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong!");
constexpr index_t GemmMRepeat = MPerBlock / (MPerThread * MLevel0Cluster * MLevel1Cluster);
constexpr index_t GemmNRepeat = NPerBlock / (NPerThread * NLevel0Cluster * NLevel1Cluster);
// c_thread_mtx definition: this is a mess
// TODO: more elegant way of defining c_thread_mtx
constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
Number<GemmMRepeat * MPerThread>{}, Number<GemmNRepeat * NPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
BlockSize,
decltype(a_k_m_block_mtx_desc),
decltype(b_k_n_block_mtx_desc),
decltype(c_m0m1_n0n1_thread_mtx_desc),
MPerThread,
NPerThread,
MLevel0Cluster,
NLevel0Cluster,
MLevel1Cluster,
NLevel1Cluster,
KPerThread,
ThreadGemmAThreadCopySrcDataPerRead_M,
ThreadGemmBThreadCopySrcDataPerRead_N>{};
// LDS allocation for A and B: be careful of alignment
constexpr index_t a_block_space =
math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_lds_align);
constexpr index_t b_block_space =
math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_lds_align);
Float* p_a_block_double = p_shared_block;
Float* p_b_block_double = p_shared_block + 2 * a_block_space;
// register allocation for output
AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()];
// zero out threadwise output
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.Run(p_a_global, p_a_block_double);
b_blockwise_copy.Run(p_b_global, p_b_block_double);
}
constexpr auto a_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
constexpr auto b_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
// LDS double buffer: main body
for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K;
k_block_data_begin += 2 * KPerBlock)
{
#pragma unroll
for(index_t iloop = 0; iloop < 2; ++iloop)
{
const bool even_loop = (iloop % 2 == 0);
Float* p_a_block_now =
even_loop ? p_a_block_double : p_a_block_double + a_block_space;
Float* p_b_block_now =
even_loop ? p_b_block_double : p_b_block_double + b_block_space;
Float* p_a_block_next =
even_loop ? p_a_block_double + a_block_space : p_a_block_double;
Float* p_b_block_next =
even_loop ? p_b_block_double + b_block_space : p_b_block_double;
Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);
__syncthreads();
// LDS double buffer: load next data from device mem
a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(p_a_block_now, p_b_block_now, p_c_thread);
// LDS double buffer: store next data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next);
}
}
// LDS double buffer: tail
{
constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0);
if(has_two_iteration_left) // if has 2 iteration left
{
Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];
a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);
__syncthreads();
// LDS double buffer: load last data from device mem
a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
// LDS double buffer: store last data to LDS
a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer,
p_a_block_double + a_block_space);
b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer,
p_b_block_double + b_block_space);
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
p_a_block_double + a_block_space, p_b_block_double + b_block_space, p_c_thread);
}
else // if has 1 iteration left
{
__syncthreads();
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
}
}
// output: register to global memory
{
constexpr index_t M1 = MPerThread * MLevel0Cluster * MLevel1Cluster;
constexpr index_t M0 = M / M1;
constexpr index_t N1 = NPerThread * NLevel0Cluster * NLevel1Cluster;
constexpr index_t N0 = N / N1;
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
Sequence<GemmMRepeat, MPerThread, GemmNRepeat, NPerThread>{});
constexpr auto c_m0_m1_n0_n1_global_desc = transform_tensor_descriptor(
c_m_n_global_desc,
make_tuple(UnMerge<Sequence<M0, M1>>{}, UnMerge<Sequence<N0, N1>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
const auto c_thread_mtx_on_block =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const index_t m_thread_data_on_global =
m_block_data_on_global + c_thread_mtx_on_block.row;
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
ThreadwiseGenericTensorSliceCopy_v4r2<decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
decltype(c_m0_m1_n0_n1_thread_desc.GetLengths()),
CThreadCopySrcDstAccessOrder,
CThreadCopySrcDstVectorReadWriteDim,
1,
CThreadCopyDstDataPerWrite,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation>(
{0, 0, 0, 0},
{m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1})
.Run(p_c_thread, p_c_global);
}
}
__device__ void Run(const Float* __restrict__ p_a_global,
const Float* __restrict__ p_b_global,
Float* __restrict__ p_c_global) const
{
constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(Float);
__shared__ Float p_shared_block[shared_block_size];
Run(p_a_global, p_b_global, p_c_global, p_shared_block);
}
};
} // namespace ck
#endif
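The main loop above is the usual LDS double-buffer schedule: preload the first K-tile, then on every iteration issue the load of the next tile into the idle buffer while the blockwise GEMM consumes the current one, and finish with a tail that processes whatever was preloaded last. The host-code sketch below shows the same ping-pong schedule in isolation; it is an illustration only (all names are invented) and it collapses the kernel's two-iteration tail into a single tail step.

#include <cstdio>
#include <vector>

int main()
{
    const int K = 40, KPerBlock = 8;
    std::vector<int> tiles(K / KPerBlock);
    for(std::size_t i = 0; i < tiles.size(); ++i) tiles[i] = static_cast<int>(i);

    int buf[2];                        // stands in for the two LDS buffers
    std::size_t next = 0;
    buf[0] = tiles[next++];            // preload the first tile into buffer 0

    long long acc = 0;
    int cur = 0;
    while(next < tiles.size())
    {
        buf[cur ^ 1] = tiles[next++];  // "load next tile" into the idle buffer
        acc += buf[cur];               // "GEMM" on the current buffer
        cur ^= 1;                      // swap buffers
    }
    acc += buf[cur];                   // tail: consume the last preloaded tile

    std::printf("processed %zu tiles, acc = %lld\n", tiles.size(), acc);
    return 0;
}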
@@ -58,14 +58,28 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
     bool forward_dim1 = true;
 
     // hardcoded for 2d loop for now
-#pragma unroll 1
+#pragma unroll
     for(int j0 = 0; j0 < J0; ++j0)
     {
-#pragma unroll 1
+#pragma unroll
         for(int j1 = 0; j1 < J1; ++j1)
         {
             // do work
-            p_dst[dst_coord.GetOffset()] = p_src[src_coord.GetOffset()];
+            transfer_data<SrcData,
+                          1,
+                          SrcAddressSpace,
+                          DstAddressSpace,
+                          DstInMemOp,
+                          SrcScalarStrideInVector,
+                          DstScalarStrideInVector>(
+                p_src,
+                src_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
+                src_desc.GetElementSpaceSize(),
+                p_dst,
+                dst_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
+                dst_desc.GetElementSpaceSize());
 
             // move dim1 iterator
             if(j1 < J1 - 1)
......
@@ -66,11 +66,11 @@ void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
 #if 1
     constexpr index_t BlockSize = 256;
-    constexpr index_t GemmKPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 8;
     constexpr index_t GemmNPerBlock = 128;
 
-    using BlockCopySubLengths_GemmK_GemmN = Sequence<8, 8>;
-    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<16, 16>;
+    using BlockCopySubLengths_GemmK_GemmN = Sequence<1, 8>;
+    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;
     using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
     using BlockCopySrcAccessOrder = Sequence<0, 1>;            // [GemmK, GemmN]
     using BlockCopyDstAccessOrder = Sequence<0, 1>;            // [GemmK, GemmN]
......
@@ -21,14 +21,14 @@ void host_col2im(const Tensor<T>& in_eb,
     int N = in_nchw.mDesc.GetLengths()[0];
     int C = in_nchw.mDesc.GetLengths()[1];
-    int HI = in_nchw.mDesc.GetLengths()[2];
-    int WI = in_nchw.mDesc.GetLengths()[3];
+    int Hi = in_nchw.mDesc.GetLengths()[2];
+    int Wi = in_nchw.mDesc.GetLengths()[3];
 
     int Y = FilterSizes{}[0];
     int X = FilterSizes{}[1];
 
-    int HO = OutputSizes{}[0];
-    int WO = OutputSizes{}[1];
+    int Ho = OutputSizes{}[0];
+    int Wo = OutputSizes{}[1];
 
     auto f = [&](auto n, auto c, auto hi, auto wi) {
         double v = 0;
@@ -37,22 +37,28 @@ void host_col2im(const Tensor<T>& in_eb,
             {
                 int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
 
-                if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
+                if(h_tmp % ConvStrides{}[0] == 0)
                 {
                     int ho = h_tmp / ConvStrides{}[0];
 
-                    for(int x = 0; x < X; ++x)
+                    if(ho >= 0 && ho < Ho)
                     {
-                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
-
-                        if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
+                        for(int x = 0; x < X; ++x)
                         {
-                            int wo = w_tmp / ConvStrides{}[1];
+                            int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+
+                            if(w_tmp % ConvStrides{}[1] == 0)
+                            {
+                                int wo = w_tmp / ConvStrides{}[1];
 
-                            int e = c * (Y * X) + y * X + x;
-                            int b = n * (HO * WO) + ho * WO + wo;
+                                if(wo >= 0 && wo < Wo && w_tmp % ConvStrides{}[1] == 0)
+                                {
+                                    int e = c * (Y * X) + y * X + x;
+                                    int b = n * (Ho * Wo) + ho * Wo + wo;
 
-                            v += in_eb(e, b);
+                                    v += in_eb(e, b);
+                                }
+                            }
                         }
                     }
                 }
             }
......
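The host reference above gathers: for a fixed image pixel (hi, wi) it solves for the output positions (ho, wo) that could have contributed to it, and the range checks added in this diff on ho and wo are exactly the validity conditions of that solve. The equivalent scatter form below makes the underlying index relation explicit; it is a stand-alone sketch with made-up shapes and loop names, not the code under test.

#include <cstdio>
#include <vector>

// For every output position (ho, wo) and filter tap (y, x), the column entry
// col(e, b) with
//     e = c * (Y * X) + y * X + x
//     b = n * (Ho * Wo) + ho * Wo + wo
// accumulates into image pixel
//     hi = ho * stride + y * dilation - pad
//     wi = wo * stride + x * dilation - pad.
int main()
{
    const int N = 1, C = 2, Hi = 5, Wi = 5, Y = 3, X = 3;
    const int stride = 1, dilation = 1, pad = 1;
    const int Ho = (Hi + 2 * pad - dilation * (Y - 1) - 1) / stride + 1; // 5
    const int Wo = (Wi + 2 * pad - dilation * (X - 1) - 1) / stride + 1; // 5

    std::vector<double> col(C * Y * X * N * Ho * Wo, 1.0); // col(e, b), all ones
    std::vector<double> img(N * C * Hi * Wi, 0.0);         // NCHW output

    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                    for(int ho = 0; ho < Ho; ++ho)
                        for(int wo = 0; wo < Wo; ++wo)
                        {
                            const int hi = ho * stride + y * dilation - pad;
                            const int wi = wo * stride + x * dilation - pad;
                            if(hi < 0 || hi >= Hi || wi < 0 || wi >= Wi)
                                continue; // this tap falls into padding
                            const int e = c * (Y * X) + y * X + x;
                            const int b = n * (Ho * Wo) + ho * Wo + wo;
                            img[((n * C + c) * Hi + hi) * Wi + wi] += col[e * (N * Ho * Wo) + b];
                        }

    // with an all-ones column matrix, each interior pixel is hit Y * X times
    std::printf("img[center of channel 0] = %f (expect %d)\n",
                img[(0 * C + 0) * Hi * Wi + 2 * Wi + 2], Y * X);
    return 0;
}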
@@ -19,21 +19,6 @@ int main(int argc, char* argv[])
     using namespace ck;
 
 #if 1
-    // 1x1, 8x8
-    constexpr index_t N = 2;
-    constexpr index_t C = 128;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K = 128;
-    constexpr index_t Y = 1;
-    constexpr index_t X = 1;
-
-    using ConvStrides = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
-#elif 0
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
@@ -541,7 +526,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #else
         col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
@@ -587,7 +572,7 @@ int main(int argc, char* argv[])
         check_error(img_nchw_host, img_nchw_device);
 
-#if 1
+#if 0
         LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
         LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
......
@@ -250,17 +250,17 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
 #endif
     (in_nchw_desc,
      in_nchw_device,
      wei_kcyx_desc,
      wei_kcyx,
      out_nkhw_desc,
      out_nkhw,
      ConvStrides{},
      ConvDilations{},
      LeftPads{},
      RightPads{},
      nrepeat);
 
     if(do_verification)
     {
......