Commit 415a4a5b authored by Chao Liu

updating v5r1

parent 32d485dd
......@@ -7,6 +7,7 @@
namespace ck {
#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block; C is distributed among the threads
// If the following numbers are powers of 2, index calculation can be greatly reduced:
......@@ -364,12 +365,20 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v1
#endif
}
};
#endif
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block; C is distributed among the threads
// If the following numbers are powers of 2, index calculation can be greatly reduced
// (see the worked example below):
// MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
// MLevel1ThreadCluster, NLevel1ThreadCluster
// Assume:
// 1. A:
// 1. BlockMatrixA is known at compile-time
// 2. ABlockBuffer is DynamicBuffer
// 2. B:
// 1. BlockMatrixB is known at compile-time
// 2. BBlockBuffer is DynamicBuffer
// 3. C:
// 1. ThreadMatrixC is known at compile-time
// 2. CThreadBuffer is StaticBuffer
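// Worked example (illustrative numbers, not from this commit): with
// NLevel0ThreadCluster = 4 and MLevel0ThreadCluster = 8, decomposing a thread id into
// cluster coordinates needs one div/mod pair per level, e.g.
//   n0 = tid % 4;  m0 = (tid / 4) % 8;
// When 4 and 8 are powers of 2 the compiler can lower these to bit operations,
//   n0 = tid & 3;  m0 = (tid >> 2) & 7;
// which is why power-of-2 cluster sizes keep the index calculation cheap.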
template <index_t BlockSize,
typename FloatA,
typename FloatB,
......
......@@ -6,6 +6,7 @@
namespace ck {
#if 0
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block; C is distributed among the threads
// If the following numbers are powers of 2, index calculation can be greatly reduced:
......@@ -199,6 +200,181 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
Run_naive(p_a_block, p_b_thread, p_c_thread);
}
};
#else
// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
// A and B are visible to the whole block; C is distributed among the threads
// If the following numbers are powers of 2, index calculation can be greatly reduced:
// KPerThread, HPerThread, MLevel0ThreadCluster, NLevel0ThreadCluster,
// MLevel1ThreadCluster, NLevel1ThreadCluster
template <index_t BlockSize,
typename FloatA,
typename FloatB,
typename FloatC,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t KPerThread,
index_t HPerThread,
index_t WPerThread,
index_t EPerThreadLoop,
index_t ThreadGemmADataPerRead_K,
index_t ThreadGemmBDataPerRead_W>
struct BlockwiseGemm_km_kn_m0m1n0n1_v3
{
struct MatrixIndex
{
index_t k;
index_t h;
index_t w;
};
MatrixIndex c_thread_begin_mtx_idx_;
// HACK: fix this @Jing Zhang
static constexpr index_t KPerThreadSubC = 4;
static constexpr auto a_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
static constexpr auto b_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
static constexpr auto c_thread_mtx_ = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
using AThreadCopy =
ThreadwiseDynamicTensorSliceTransfer_v4<FloatA,
FloatA,
BlockMatrixA,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadSubC>,
Sequence<0, 1>,
1,
ThreadGemmADataPerRead_K,
AddressSpace::Generic,
AddressSpace::Vgpr,
1>;
AThreadCopy a_thread_copy_;
__device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)}
{
static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
BlockMatrixB::IsKnownAtCompileTime() &&
ThreadMatrixC::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
constexpr index_t H = BlockMatrixB{}.GetLength(I2);
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
"wrong! Cannot evenly divide work among\n");
constexpr auto KThreadCluster = K / KPerThread;
constexpr auto HThreadCluster = H / HPerThread;
constexpr auto WThreadCluster = W / WPerThread;
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
"wrong! wrong blocksize\n");
}
__device__ static constexpr auto GetThreadMatrixCLengths()
{
return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
constexpr auto num_w_threads = W / WPerThread;
constexpr auto num_h_threads = H / HPerThread;
constexpr auto num_hw_threads = num_w_threads * num_h_threads;
index_t k_thread_id = thread_id / num_hw_threads;
index_t hw_thread_id = thread_id % num_hw_threads;
index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads;
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
}
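// Example (illustrative numbers): for H = 8, W = 16, HPerThread = 2, WPerThread = 4
// we get num_w_threads = 4, num_h_threads = 4, num_hw_threads = 16, so thread_id 37
// maps to MatrixIndex{ k = 37 / 16 = 2, h = (37 % 16) / 4 = 1, w = (37 % 16) % 4 = 1 }.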
__device__ void Run(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
{
auto a_block_buf = make_dynamic_buffer(p_a_block);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
// HACK: fix this @Jing Zhang
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A, B for GEMM
FloatA p_a_thread[a_thread_mtx_.GetElementSpaceSize()];
auto a_thread_buf = make_dynamic_buffer(p_a_thread);
auto b_thread_buf = make_dynamic_buffer(p_b_thread);
auto c_thread_buf = make_dynamic_buffer(p_c_thread);
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<FloatA,
FloatB,
FloatC,
decltype(a_thread_mtx_),
decltype(b_thread_mtx_),
decltype(c_thread_mtx_),
HoPerThreadSubC,
WoPerThreadSubC>{};
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) {
a_thread_copy_.Run(a_block_mtx,
make_tuple(e_begin, k_begin),
a_block_buf,
a_thread_mtx_,
make_tuple(I0, I0),
a_thread_buf);
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0),
b_thread_buf,
make_tuple(e_begin, I0, h_begin, w_begin),
c_thread_buf,
make_tuple(k_begin, I0, h_begin, w_begin));
});
});
});
});
}
};
#endif
} // namespace ck
#endif
......@@ -11,6 +11,7 @@
namespace ck {
#if 0
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
......@@ -462,6 +463,484 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
};
#else
template <index_t BlockSize,
typename FloatAB,
typename FloatAcc,
typename FloatC,
InMemoryDataOperation CGlobalMemoryDataOperation,
typename AGlobalDesc,
typename BGlobalDesc,
typename CGlobalDesc,
index_t KPerBlock,
index_t HoPerBlock,
index_t WoPerBlock,
index_t EPerBlock,
index_t KPerThread,
index_t HoPerThread,
index_t WoPerThread,
index_t EPerThread,
typename ABlockTransferThreadSliceLengths_E_K,
typename ABlockTransferThreadClusterLengths_E_K,
typename ABlockTransferThreadClusterArrangeOrder,
typename ABlockTransferSrcAccessOrder,
index_t ABlockTransferSrcVectorDim,
index_t ABlockTransferSrcScalarPerVector,
index_t ABlockTransferDstScalarPerVector_K,
bool AThreadTransferSrcResetCoordinateAfterRun,
typename BBlockTransferSrcAccessOrder,
index_t BBlockTransferSrcVectorDim,
index_t BBlockTransferSrcScalarPerVector,
bool BThreadTransferSrcResetCoordinateAfterRun,
typename CThreadTransferSrcDstAccessOrder,
index_t CThreadTransferSrcDstVectorDim,
index_t CThreadTransferDstScalarPerVector,
typename AGlobalIteratorHacks,
typename BGlobalIteratorHacks,
typename CGlobalIteratorHacks,
typename AGlobalMoveSliceWindowIteratorHacks,
typename BGlobalMoveSliceWindowIteratorHacks>
struct GridwiseDynamicGemm_km_kn_mn_v3
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
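// NOTE (assumption, not stated in this commit): the 3 * 3 factor below looks like a
// hard-coded Y * X filter window for a 3x3 convolution, so E spans EPerBlock slices
// of the reduction dimension times the filter window.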
constexpr auto E = EPerBlock * 3 * 3;
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);
return a_block_space_size * sizeof(FloatAB);
}
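// Sizing example (illustrative numbers, assuming the aligned descriptor only pads the
// innermost stride up to max_lds_align): EPerBlock = 8 gives E = 72; with
// KPerBlock = 16, ABlockTransferDstScalarPerVector_K = 4 and FloatAB = float,
// max_lds_align = lcm(4, 16) = 16, the element space is 72 * 16 = 1152, already a
// multiple of 16, so this function returns 1152 * 4 = 4608 bytes of LDS.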
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
FloatAB* __restrict__ p_shared_block,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto E = EPerBlock * 3 * 3;
// const auto E = a_e_k_global_desc.GetLength(I0);
const auto K = a_e_k_global_desc.GetLength(I1);
const auto N = b_e_n_ho_wo_global_desc.GetLength(I1);
const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);
// divide block work by [M, N]
#if 0
const auto k_block_work_num = K / Number<KPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num;
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
// Hack: this forces the result into an SGPR (uniform across the wavefront)
const index_t k_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
const index_t ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id =
__builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id =
__builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#endif
// lds max alignment
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_e_n_ho_wo_block_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));
// c_thread_mtx definition: this is a mess
// TODO: find a more elegant way of defining c_thread_mtx
constexpr auto c_k_n_ho_wo_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
const auto blockwise_gemm =
BlockwiseGemm_km_kn_m0m1n0n1_v3<BlockSize,
FloatAB,
FloatAB,
FloatAcc,
decltype(a_e_k_block_desc),
decltype(b_e_n_ho_wo_block_desc),
decltype(c_k_n_ho_wo_thread_desc),
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K>{};
auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const auto k_thread_id = c_thread_mtx_index.k;
const auto ho_thread_id = c_thread_mtx_index.h;
const auto wo_thread_id = c_thread_mtx_index.w;
const index_t k_block_data_on_global = k_block_work_id * KPerBlock;
const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
const index_t ho_thread_data_on_global =
ho_block_data_on_global + ho_thread_id * HoPerThread;
const index_t wo_thread_data_on_global =
wo_block_data_on_global + wo_thread_id * WoPerThread;
// A matrix blockwise copy
auto a_blockwise_copy =
BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperation::Set,
Sequence<E, KPerBlock>,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferThreadClusterArrangeOrder,
FloatAB,
FloatAB,
decltype(a_e_k_global_desc),
decltype(a_e_k_desc),
ABlockTransferSrcAccessOrder,
Sequence<0, 1>,
ABlockTransferSrcVectorDim,
1,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K,
AddressSpace::Global,
AddressSpace::Lds,
1,
1,
AThreadTransferSrcResetCoordinateAfterRun,
true>(
a_e_k_global_desc,
make_multi_index(0, k_block_data_on_global),
a_e_k_desc,
make_multi_index(0, 0));
constexpr auto b_e_n_ho_wo_thread_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
FloatAB,
FloatAB,
decltype(b_e_n_ho_wo_global_desc),
decltype(b_e_n_ho_wo_thread_desc),
Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
AddressSpace::Global,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(b_e_n_ho_wo_global_desc,
make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));
FloatAB* p_a_block = p_shared_block;
#if 0
// register allocation for output
FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
// zero out threadwise output
threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_c_thread);
#else
// register allocation for output
FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
auto c_thread_buf = make_dynamic_buffer(p_c_thread);
ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
#endif
constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
// hacks to control index calculation when iterating over the A and B matrices for threadwise copy
constexpr auto a_e_k_global_iterator_hacks = AGlobalIteratorHacks{};
constexpr auto b_e_n_ho_wo_global_iterator_hacks = BGlobalIteratorHacks{};
// hacks to control index calculation when moving the slice window of the A and B
// matrices for threadwise copy
constexpr auto a_e_k_global_move_slice_window_iterator_hack =
AGlobalMoveSliceWindowIteratorHacks{};
constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
BGlobalMoveSliceWindowIteratorHacks{};
#if 0
constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();
FloatAB p_b_thread[b_thread_space_size * 2];
FloatAB* p_b_thread_double = p_b_thread;
#elif 1
constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();
FloatAB p_b_thread[b_thread_space_size * 2];
auto b_thread_even_buf = make_dynamic_buffer(p_b_thread);
auto b_thread_odd_buf = make_dynamic_buffer(p_b_thread + b_thread_space_size);
#else
StaticBuffer<FloatAB, b_e_n_ho_wo_thread_desc.GetElementSpaceSize()> a_thread_even_buf,
a_thread_odd_buf;
#endif
// LDS double buffer: preload data into LDS
{
a_blockwise_copy.RunRead(a_e_k_global_desc, p_a_global, a_e_k_global_iterator_hacks);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
p_b_global,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_even_buf.p_data_,
b_e_n_ho_wo_global_iterator_hacks);
a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
}
__syncthreads();
index_t b_block_data_begin = 0;
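// Double-buffer schedule: each half-iteration below prefetches the next B slice into
// the other thread buffer (even/odd) while the blockwise GEMM consumes the current
// one, so global B loads overlap with math on the A tile already resident in LDS.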
if constexpr(HasMainKBlockLoop)
{
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
p_b_global,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_odd_buf.p_data_,
b_e_n_ho_wo_global_iterator_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(
p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
b_thread_even_buf.p_data_,
p_c_thread);
b_block_data_begin += EPerBlock;
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
p_b_global,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_even_buf.p_data_,
b_e_n_ho_wo_global_iterator_hacks);
// LDS double buffer: GEMM on current data
blockwise_gemm.Run(
p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
b_thread_odd_buf.p_data_,
p_c_thread);
b_block_data_begin += EPerBlock;
} while(b_block_data_begin < E - 2 * EPerBlock);
}
// LDS double buffer: tail
if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
{
b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
b_thread_slice_copy_step);
b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
p_b_global,
b_e_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
b_thread_odd_buf.p_data_,
b_e_n_ho_wo_global_iterator_hacks);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm.Run(
p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
b_thread_even_buf.p_data_,
p_c_thread);
b_block_data_begin += EPerBlock;
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
b_thread_odd_buf.p_data_,
p_c_thread);
}
else // if has 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm.Run(
p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
b_thread_even_buf.p_data_,
p_c_thread);
}
// output: register to global memory
{
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
const index_t k_thread_data_on_global =
k_block_data_on_global + k_thread_id * KPerThread;
ThreadwiseDynamicTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
decltype(c_k_n_ho_wo_thread_desc),
decltype(c_k_n_ho_wo_global_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true>(
c_k_n_ho_wo_global_desc,
make_multi_index(
k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
.Run(c_k_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
c_thread_buf,
c_k_n_ho_wo_global_desc,
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
}
// pass tensor descriptor by reference
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc& a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
p_shared_block,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by their pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *p_a_e_k_global_desc;
const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
const auto b_e_n_ho_wo_global_desc =
*reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
const auto c_k_n_ho_wo_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
};
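// Hedged usage sketch (hypothetical wrapper, not part of this commit): the void*
// overload above lets a generic kernel forward descriptors that were copied to device
// memory, e.g.
//
//   template <typename GridwiseGemm, typename FloatAB, typename FloatC,
//             bool HasMainLoop, bool HasDoubleTail>
//   __global__ void kernel_dynamic_gemm_v3(const void* p_a_desc, const FloatAB* p_a,
//                                          const void* p_b_desc, const FloatAB* p_b,
//                                          const void* p_c_desc, FloatC* p_c)
//   {
//       GridwiseGemm{}.Run(p_a_desc, p_a, p_b_desc, p_b, p_c_desc, p_c,
//                          integral_constant<bool, HasMainLoop>{},
//                          integral_constant<bool, HasDoubleTail>{});
//   }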
#endif
} // namespace ck
#endif
......@@ -28,7 +28,10 @@ struct ThreadwiseDynamicTensorSliceSet_v1
static_assert(Desc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
#if 0
// TODO: turn this on when v5r1 is updated to get rid of raw arrays
static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif
static_assert(is_known_at_compile_time<remove_cv_t<remove_reference_t<OriginIdx>>>::value,
"wrong! OriginIdx need to be known at compile-time");
......
......@@ -1399,7 +1399,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
remove_cv_t<remove_reference_t<DstData>>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
#if 0
// turn this on after v5r1 is updated
static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
#endif
static_assert(is_known_at_compile_time<
remove_cv_t<remove_reference_t<SrcRefToOriginDisplacement>>>::value &&
......@@ -1413,8 +1416,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// SrcOriginToRefDistance and DstOriginToRefDistance are known at compile-time
constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
constexpr auto dst_ref_to_origin_disp_idx = to_multi_index(DstRefToOriginDisplacement{});
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -1469,7 +1472,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
// src coordinate
constexpr auto src_ref_to_data_disp_idx =
src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
constexpr auto src_ref_to_data_disp_coord_iterator =
make_dynamic_tensor_coordinate_iterator(src_desc, src_ref_to_data_disp_idx);
......
......@@ -6,7 +6,7 @@
namespace ck {
#if 0
template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread)
{
......@@ -174,8 +174,7 @@ struct ThreadwiseGemm_km_kn_mn_v1
// Element of matrix can be vectorized data
// Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA,
typename FloatB,
typename FloatC,
......
......@@ -6,6 +6,7 @@
namespace ck {
#if 0
template <typename Float, typename Desc>
__device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
{
......@@ -31,10 +32,15 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
});
});
}
#endif
#if 0
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc,
typename CDesc,
index_t H,
......@@ -44,8 +50,7 @@ template <typename ADesc,
bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3
{
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
......@@ -132,13 +137,153 @@ struct ThreadwiseGemm_km_kn_mn_v3
});
});
}
};
#else
// C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data
// Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
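// This specialization is what BlockwiseGemm_km_kn_m0m1n0n1_v3 (earlier in this commit)
// instantiates with H = HoPerThreadSubC and W = WoPerThreadSubC; the H == 2 && W == 2
// and H == 4 && W == 1 branches below feed one A element and a small spatial tile of B
// into amd_assembly_outer_product_1x4.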
template <typename FloatA,
typename FloatB,
typename FloatC,
typename ADesc,
typename BDesc,
typename CDesc,
index_t H,
index_t W,
typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
bool>::type = false>
struct ThreadwiseGemm_km_kn_mn_v3
{
template <typename ABuffer,
typename AOriginIdx,
typename BBuffer,
typename BOriginIdx,
typename CBuffer,
typename COriginIdx>
__device__ static void Run(const ABuffer& a_buf,
AOriginIdx,
const BBuffer& b_buf,
BOriginIdx,
CBuffer& c_buf,
COriginIdx)
{
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
static_assert(
is_known_at_compile_time<remove_cv_t<remove_reference_t<AOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<BOriginIdx>>>::value &&
is_known_at_compile_time<remove_cv_t<remove_reference_t<COriginIdx>>>::value,
"wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto E = ADesc{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, E, 1>{}([&](auto e) {
static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset =
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k));
if constexpr(H == 2 && W == 2)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else if constexpr(H == 4 && W == 1)
{
constexpr index_t b_offset_0 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}],
b_buf[Number<b_offset_1>{}],
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}],
c_buf(Number<c_offset_0>{}),
c_buf(Number<c_offset_1>{}),
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{}));
}
else
{
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) {
constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w));
constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
#if 0
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}]);
#else
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{}));
#endif
});
});
}
});
});
}
};
#endif
} // namespace ck
#endif
......@@ -14,7 +14,7 @@
#define CK_DEVICE_BACKEND_AMD 1
// GPU ID
#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
......@@ -28,7 +28,7 @@
#endif
// launch bounds
#define CK_USE_LAUNCH_BOUNDS 1
#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
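// Illustrative note (assumption, not part of this commit): kernels typically consume
// these macros as
//   __global__ void __launch_bounds__(CK_MAX_THREAD_PER_BLOCK) some_kernel(...);
// so enabling CK_USE_LAUNCH_BOUNDS lets the compiler assume at most 256 threads per
// block when allocating registers.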
......
......@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 1080;
......@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", to_multi_index(ConvStrides{}));
print_array("ConvDilations", to_multi_index(ConvDilations{}));
#if 0
using in_data_t = float;
constexpr index_t in_vector_size = 1;
using acc_data_t = float;
......@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
in_vector_size,
acc_data_t,
......