clean up

08bb4372 · Chao Liu · 905f5a3f · 08bb4372 · 08bb4372 · 08bb4372
Commit 08bb4372 authored Apr 23, 2021 by Chao Liu
5 changed files
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
@@ -7,366 +7,6 @@
 namespace ck {
-#if 0
-// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
-// A and B are visible to the whole block, C is distributed among the threads
-// If the following numbers are powers of 2, index calculation is greatly simplified:
-//    MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
-//    MLevel1ThreadCluster, NLevel1ThreadCluster
-template <index_t BlockSize,
-          typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename BlockMatrixA,
-          typename BlockMatrixB,
-          typename ThreadMatrixC,
-          index_t MPerThreadSubC,
-          index_t NPerThreadSubC,
-          index_t KPerThreadLoop,
-          index_t MLevel0ThreadCluster,
-          index_t NLevel0ThreadCluster,
-          index_t MLevel1ThreadCluster,
-          index_t NLevel1ThreadCluster,
-          index_t ThreadGemmADataPerRead_M,
-          index_t ThreadGemmBDataPerRead_N>
-struct BlockwiseGemm_km_kn_m0m1n0n1_v1
-{
-    struct MatrixIndex // (row, col) pair returned by GetBeginOfThreadMatrixC
-    {
-        index_t row;
-        index_t col;
-    };
-    index_t mMyThreadOffsetA; // BlockMatrixA offset of (0, row) for this thread; set in the constructor
-    index_t mMyThreadOffsetB; // BlockMatrixB offset of (0, col) for this thread; set in the constructor
-    __device__ BlockwiseGemm_km_kn_m0m1n0n1_v1() // checks the compile-time configuration, then caches this thread's A/B offsets
-    {
-        static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
-                          BlockMatrixB::IsKnownAtCompileTime() &&
-                          ThreadMatrixC::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr index_t ThreadPerLevel1Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster *
-                                                   MLevel1ThreadCluster * NLevel1ThreadCluster;
-        static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
-        static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
-                      "wrong! K dimension not consistent\n");
-        constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
-        static_assert(M % (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster) == 0 &&
-                          N % (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster) == 0,
-                      "wrong! Cannot evenly divide work among\n");
-        static_assert(ThreadMatrixC{}.GetLength(I0) == GetThreadMatrixCLengths()[I0] &&
-                          ThreadMatrixC{}.GetLength(I1) == GetThreadMatrixCLengths()[I1],
-                      "wrong! ThreadMatrixC lengths is wrong");
-        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); // (row, col) origin for the calling thread
-        mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.row));
-        mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.col));
-    }
-    __device__ static constexpr auto GetThreadMatrixCLengths() // returns Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>
-    {
-        constexpr auto I1 = Number<1>{};
-        constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
-        constexpr index_t MRepeat =
-            M / (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster);
-        constexpr index_t NRepeat =
-            N / (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster);
-        return Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>{};
-    }
-    __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) // maps thread_id to a (row, col) origin built from its level-1 and level-0 cluster ids
-    {
-        constexpr index_t ThreadPerLevel0Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster;
-        index_t level1_id   = thread_id / ThreadPerLevel0Cluster;
-        index_t level1_m_id = level1_id / NLevel1ThreadCluster;
-        index_t level1_n_id = level1_id % NLevel1ThreadCluster;
-        index_t level0_id   = thread_id % ThreadPerLevel0Cluster;
-        index_t level0_m_id = level0_id / NLevel0ThreadCluster;
-        index_t level0_n_id = level0_id % NLevel0ThreadCluster;
-        constexpr index_t MPerLevel0Cluster = MPerThreadSubC * MLevel0ThreadCluster;
-        constexpr index_t NPerLevel0Cluster = NPerThreadSubC * NLevel0ThreadCluster;
-        return MatrixIndex{level1_m_id * MPerLevel0Cluster + level0_m_id * MPerThreadSubC,
-                           level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
-    }
-    __device__ void
-    Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const // per K step: copy every A and B sub-slice into local arrays, then run one threadwise GEMM
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto a_block_mtx  = BlockMatrixA{};
-        constexpr auto b_block_mtx  = BlockMatrixB{};
-        constexpr auto c_thread_mtx = ThreadMatrixC{};
-        constexpr auto K = a_block_mtx.GetLength(I0);
-        constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
-        constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
-        constexpr index_t MPerLevel1Cluster =
-            MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
-        constexpr index_t NPerLevel1Cluster =
-            NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-        // thread A, B for GEMM: packed [KPerThreadLoop, MPerThread] and [KPerThreadLoop, NPerThread] descriptors
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-        FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
-        constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
-                                                                    decltype(a_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    MPerThreadSubC,
-                                                                    ThreadGemmADataPerRead_M>{};
-        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
-                                                                    decltype(b_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    NPerThreadSubC,
-                                                                    ThreadGemmBDataPerRead_N>{};
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<FloatA,
-                                                                    FloatB,
-                                                                    FloatC,
-                                                                    decltype(a_thread_mtx),
-                                                                    decltype(b_thread_mtx),
-                                                                    decltype(c_thread_mtx)>{};
-        // loop over k in steps of KPerThreadLoop
-        static_for<0, K, KPerThreadLoop>{}([&](auto k_begin) {
-            // read A: one MPerThreadSubC-wide slice per m_repeat, MPerLevel1Cluster apart in the block matrix
-            static_for<0, MRepeat, 1>{}([&](auto m_repeat) {
-                a_thread_copy.Run(p_a_block +
-                                      a_block_mtx.CalculateOffset(
-                                          make_tuple(k_begin, m_repeat * MPerLevel1Cluster)) +
-                                      mMyThreadOffsetA,
-                                  p_a_thread + a_thread_mtx.CalculateOffset(
-                                                   make_tuple(0, m_repeat * MPerThreadSubC)));
-            });
-            // read B: one NPerThreadSubC-wide slice per n_repeat, NPerLevel1Cluster apart in the block matrix
-            static_for<0, NRepeat, 1>{}([&](auto n_repeat) {
-                b_thread_copy.Run(p_b_block +
-                                      b_block_mtx.CalculateOffset(
-                                          make_tuple(k_begin, n_repeat * NPerLevel1Cluster)) +
-                                      mMyThreadOffsetB,
-                                  p_b_thread + b_thread_mtx.CalculateOffset(
-                                                   make_tuple(0, n_repeat * NPerThreadSubC)));
-            });
-            // C += A * B
-            threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-        });
-    }
-    __device__ void
-    Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const // 2x2 sub-tile variant: reads of the next k slice are interleaved with GEMMs on the current one
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto a_block_mtx  = BlockMatrixA{};
-        constexpr auto b_block_mtx  = BlockMatrixB{};
-        constexpr auto c_thread_mtx = ThreadMatrixC{};
-        constexpr auto K = a_block_mtx.GetLength(I0);
-        constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
-        constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
-        constexpr index_t MPerLevel1Cluster =
-            MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
-        constexpr index_t NPerLevel1Cluster =
-            NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-        static_assert(MRepeat == 2 && NRepeat == 2,
-                      "wrong! inline asm cannot deal with this GEMM config yet");
-        // thread A, B: packed descriptors covering both sub-tiles
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
-        // thread A-sub, B-sub, C-sub: single sub-tile descriptors (second tuple is presumably the strides of the full tile - TODO confirm)
-        constexpr auto a_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}),
-            make_tuple(Number<MPerThread>{}, Number<1>{}));
-        constexpr auto b_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}),
-            make_tuple(Number<NPerThread>{}, Number<1>{}));
-        constexpr auto c_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}),
-            make_tuple(Number<NPerThread>{}, Number<1>{}));
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-        FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
-        constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
-                                                                    decltype(a_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    MPerThreadSubC,
-                                                                    ThreadGemmADataPerRead_M>{};
-        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
-                                                                    decltype(b_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    NPerThreadSubC,
-                                                                    ThreadGemmBDataPerRead_N>{};
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<FloatA,
-                                                                    FloatB,
-                                                                    FloatC,
-                                                                    decltype(a_thread_sub_mtx),
-                                                                    decltype(b_thread_sub_mtx),
-                                                                    decltype(c_thread_sub_mtx)>{};
-        const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA;
-        const FloatB* p_b_block_off = p_b_block + mMyThreadOffsetB;
-        // read A_sub_0 (prologue: first k slice)
-        a_thread_copy.Run(p_a_block_off, p_a_thread);
-        // read B_sub_0
-        b_thread_copy.Run(p_b_block_off, p_b_thread);
-        // read B_sub_1
-        b_thread_copy.Run(p_b_block_off +
-                              b_block_mtx.CalculateOffset(make_tuple(0, NPerLevel1Cluster)),
-                          p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-        // read A_sub_1
-        a_thread_copy.Run(p_a_block_off +
-                              a_block_mtx.CalculateOffset(make_tuple(0, MPerLevel1Cluster)),
-                          p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
-        // C_sub_00 += transpose(A_sub_0) * B_sub_0
-        threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-        // C_sub_01 += transpose(A_sub_0) * B_sub_1
-        threadwise_gemm.Run(
-            p_a_thread,
-            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-        // loop over rest of k: each iteration finishes C_sub_10 / C_sub_11 for the previous slice while loading slice k
-        static_for<KPerThreadLoop, K, KPerThreadLoop>{}([&](auto k) {
-            // read A_sub_0
-            a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, 0)),
-                              p_a_thread);
-            // C_sub_10 += transpose(A_sub_1) * B_sub_0
-            threadwise_gemm.Run(
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-                p_b_thread,
-                p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
-            // read B_sub_0
-            b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, 0)),
-                              p_b_thread);
-            // C_sub_11 += transpose(A_sub_1) * B_sub_1
-            threadwise_gemm.Run(
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-                p_c_thread +
-                    c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
-            // read B_sub_1
-            b_thread_copy.Run(
-                p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, NPerLevel1Cluster)),
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-            // read A_sub_1
-            a_thread_copy.Run(
-                p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, MPerLevel1Cluster)),
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
-            // C_sub_00 += transpose(A_sub_0) * B_sub_0
-            threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-            // C_sub_01 += transpose(A_sub_0) * B_sub_1
-            threadwise_gemm.Run(
-                p_a_thread,
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-                p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-        });
-        // C_sub_10 += transpose(A_sub_1) * B_sub_0 (tail: last k slice)
-        threadwise_gemm.Run(
-            p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-            p_b_thread,
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
-        // C_sub_11 += transpose(A_sub_1) * B_sub_1 (tail: last k slice)
-        threadwise_gemm.Run(
-            p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
-    }
-    __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const // entry point: Run_pipelined_2x2 if the pipeline macro is on and MRepeat == NRepeat == 2, otherwise Run_naive
-    {
-#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr index_t MPerThread = ThreadMatrixC{}.GetLength(I0);
-        constexpr index_t NPerThread = ThreadMatrixC{}.GetLength(I1);
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-        if constexpr(MRepeat == 2 && NRepeat == 2)
-        {
-            Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
-        }
-        else
-        {
-            Run_naive(p_a_block, p_b_block, p_c_thread);
-        }
-#else
-        Run_naive(p_a_block, p_b_block, p_c_thread);
-#endif
-    }
-};
-#endif
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 // A and B are visible to the whole block, C is distributed among the threads
 // Assume:

--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -6,201 +6,6 @@
 namespace ck {
-#if 0
-// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
-// A and B are visible to the whole block, C is distributed among the threads
-// If the following numbers are powers of 2, index calculation is greatly simplified:
-//    KPerThread, HPerThread, MLevel0ThreadCluster, NLevel0ThreadCluster,
-//    MLevel1ThreadCluster, NLevel1ThreadCluster
-template <index_t BlockSize,
-          typename BlockMatrixA,
-          typename BlockMatrixB,
-          typename ThreadMatrixC,
-          index_t KPerThread,
-          index_t HPerThread,
-          index_t WPerThread,
-          index_t EPerThreadLoop,
-          index_t ThreadGemmADataPerRead_K,
-          index_t ThreadGemmBDataPerRead_W>
-struct BlockwiseGemm_km_kn_m0m1n0n1_v3
-{
-    struct MatrixIndex // per-thread (k, h, w) ids returned by GetBeginOfThreadMatrixC
-    {
-        index_t k;
-        index_t h;
-        index_t w;
-    };
-    index_t mMyThreadOffsetA; // BlockMatrixA offset of (0, k * KPerThread) for this thread; set in the constructor
-    __device__ BlockwiseGemm_km_kn_m0m1n0n1_v3() // checks the compile-time configuration, then caches this thread's A offset
-    {
-        static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
-                          BlockMatrixB::IsKnownAtCompileTime() &&
-                          ThreadMatrixC::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
-                      "wrong! K dimension not consistent\n");
-        constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1); // NOTE(review): N is not used in the rest of this constructor
-        constexpr index_t H = BlockMatrixB{}.GetLength(I2);
-        constexpr index_t W = BlockMatrixB{}.GetLength(I3);
-        static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
-                      "wrong! Cannot evenly divide work among\n");
-        constexpr auto KThreadCluster = K / KPerThread;
-        constexpr auto HThreadCluster = H / HPerThread;
-        constexpr auto WThreadCluster = W / WPerThread;
-        static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
-                      "wrong! wrong blocksize\n");
-        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); // (k, h, w) ids of the calling thread
-        mMyThreadOffsetA =
-            BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.k * KPerThread));
-    }
-    __device__ static constexpr auto GetThreadMatrixCLengths() // per-thread C lengths: [KPerThread, 1, HPerThread, WPerThread]
-    {
-        return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
-    }
-    __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) // splits thread_id into (k, h, w) thread ids, with w varying fastest
-    {
-        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
-        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
-        constexpr auto num_w_threads  = W / WPerThread;
-        constexpr auto num_h_threads  = H / HPerThread;
-        constexpr auto num_hw_threads = num_w_threads * num_h_threads;
-        index_t k_thread_id  = thread_id / num_hw_threads;
-        index_t hw_thread_id = thread_id % num_hw_threads;
-        index_t h_thread_id = hw_thread_id / num_w_threads;
-        index_t w_thread_id = hw_thread_id % num_w_threads;
-        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
-    }
-    template <typename SrcDesc,
-              typename DstDesc,
-              index_t NSliceRow,
-              index_t NSliceCol,
-              index_t DataPerAccess>
-    struct ThreadwiseSliceCopy_a // copies an NSliceRow x NSliceCol slice from p_src to p_dst, DataPerAccess elements per access
-    {
-        template <typename Data>
-        __device__ static void Run(const Data* p_src, Data* p_dst)
-        {
-            static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                          "wrong! Desc should be known at compile-time");
-            using vector_t = typename vector_type_maker<Data, DataPerAccess>::type::type;
-            static_for<0, NSliceRow, 1>{}([&](auto i) {
-                static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
-                    constexpr auto src_offset = SrcDesc{}.CalculateOffset(make_tuple(i, j));
-                    constexpr auto dst_offset = DstDesc{}.CalculateOffset(make_tuple(i, j));
-                    *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = // vector-typed copy; presumably needs both addresses aligned for vector_t - TODO confirm
-                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-                });
-            });
-        }
-    };
-    template <typename FloatA, typename FloatB, typename FloatC>
-    __device__ void
-    Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const // only A is copied into a local array; B and C are addressed directly through p_b_thread / p_c_thread
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto a_block_mtx = BlockMatrixA{};
-        constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
-        constexpr auto KPerThreadSubC = 4; // hard-coded sub-tile size along k
-        constexpr auto HoPerThreadSubC = 2; // hard-coded sub-tile size along h
-        constexpr auto WoPerThreadSubC = 2; // hard-coded sub-tile size along w
-        static_assert(KPerThread % KPerThreadSubC == 0, "");
-        static_assert(HPerThread % HoPerThreadSubC == 0, "");
-        static_assert(WPerThread % WoPerThreadSubC == 0, "");
-        // thread A, B, C descriptors for GEMM (all packed)
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-            Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
-        constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-            Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-        constexpr auto a_thread_copy = ThreadwiseSliceCopy_a<BlockMatrixA,
-                                                             decltype(a_thread_mtx),
-                                                             EPerThreadLoop,
-                                                             KPerThreadSubC,
-                                                             ThreadGemmADataPerRead_K>{};
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
-                                                                    decltype(b_thread_mtx),
-                                                                    decltype(c_thread_mtx),
-                                                                    HoPerThreadSubC,
-                                                                    WoPerThreadSubC>{};
-        // loop over e in steps of EPerThreadLoop, then over the k, h and w sub-tiles
-#pragma unroll
-        for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
-        {
-#pragma unroll
-            for(index_t k_begin = 0; k_begin < KPerThread; k_begin += KPerThreadSubC)
-            {
-                a_thread_copy.Run(p_a_block +
-                                      a_block_mtx.CalculateOffset(make_tuple(e_begin, k_begin)) +
-                                      mMyThreadOffsetA,
-                                  p_a_thread);
-#pragma unroll
-                for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
-                {
-#pragma unroll
-                    for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
-                    {
-                        threadwise_gemm.Run(p_a_thread,
-                                            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
-                                                             e_begin, 0, h_begin, w_begin)),
-                                            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(
-                                                             k_begin, 0, h_begin, w_begin)));
-                    }
-                }
-            }
-        }
-    }
-    template <typename FloatA, typename FloatB, typename FloatC>
-    __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const // entry point: forwards to Run_naive
-    {
-        Run_naive(p_a_block, p_b_thread, p_c_thread);
-    }
-};
-#else
 // blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
 // A and B are visible to the whole block, C is distributed among the threads
 // If the following numbers are powers of 2, index calculation is greatly simplified:
@@ -379,7 +184,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        });
    }
 };
-#endif
 } // namespace ck
 #endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -11,459 +11,6 @@
 namespace ck {
-#if 0
-template <index_t BlockSize,
-          typename FloatAB,
-          typename FloatAcc,
-          typename FloatC,
-          InMemoryDataOperation CGlobalMemoryDataOperation,
-          typename AGlobalDesc,
-          typename BGlobalDesc,
-          typename CGlobalDesc,
-          index_t KPerBlock,
-          index_t HoPerBlock,
-          index_t WoPerBlock,
-          index_t EPerBlock,
-          index_t KPerThread,
-          index_t HoPerThread,
-          index_t WoPerThread,
-          index_t EPerThread,
-          typename ABlockTransferThreadSliceLengths_E_K,
-          typename ABlockTransferThreadClusterLengths_E_K,
-          typename ABlockTransferThreadClusterArrangeOrder,
-          typename ABlockTransferSrcAccessOrder,
-          index_t ABlockTransferSrcVectorDim,
-          index_t ABlockTransferSrcScalarPerVector,
-          index_t ABlockTransferDstScalarPerVector_K,
-          bool AThreadTransferSrcResetCoordinateAfterRun,
-          typename BBlockTransferSrcAccessOrder,
-          index_t BBlockTransferSrcVectorDim,
-          index_t BBlockTransferSrcScalarPerVector,
-          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
-          index_t CThreadTransferSrcDstVectorDim,
-          index_t CThreadTransferDstScalarPerVector,
-          typename AGlobalIteratorHacks,
-          typename BGlobalIteratorHacks,
-          typename CGlobalIteratorHacks,
-          typename AGlobalMoveSliceWindowIteratorHacks,
-          typename BGlobalMoveSliceWindowIteratorHacks>
-struct GridwiseDynamicGemm_km_kn_mn_v3
-{
-    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
-    {
-        constexpr auto E = EPerBlock * 3 * 3;
-        constexpr auto max_lds_align =
-            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
-            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_space_size =
-            math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align);
-        return a_block_space_size * sizeof(FloatAB);
-    }
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        FloatAB* __restrict__ p_shared_block,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto E = EPerBlock * 3 * 3;
-        // const auto E = a_e_k_global_desc.GetLength(I0);
-        const auto K = a_e_k_global_desc.GetLength(I1);
-        const auto N  = b_e_n_ho_wo_global_desc.GetLength(I1);
-        const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2);
-        const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3);
-        // divide block work by [M, N]
-#if 0
-        const auto k_block_work_num   = K / Number<KPerBlock>{};
-        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
-        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
-        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-        const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#else
-        // Hack: this force result into SGPR
-        const index_t k_block_work_num   = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
-        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
-        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-        const index_t ho_block_work_id =
-            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#endif
-        // lds max alignment
-        constexpr auto max_lds_align =
-            math::lcm(Number<ABlockTransferDstScalarPerVector_K>{}, Number<KPerBlock>{});
-        // A matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto a_e_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
-            make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
-        constexpr auto a_e_k_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
-            make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
-        // B matrix in LDS memory, dst of blockwise copy
-        //   be careful of LDS alignment
-        constexpr auto b_e_n_ho_wo_block_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-                Number<EPerBlock>{}, Number<1>{}, Number<HoPerBlock>{}, Number<WoPerBlock>{}));
-        // c_thread_mtx definition: this is a mess
-        // TODO:: more elegent way of defining c_thread_mtx
-        constexpr auto c_k_n_ho_wo_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-                Number<KPerThread>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
-        const auto blockwise_gemm =
-            BlockwiseGemm_km_kn_m0m1n0n1_v3<BlockSize,
-                                            decltype(a_e_k_block_desc),
-                                            decltype(b_e_n_ho_wo_block_desc),
-                                            decltype(c_k_n_ho_wo_thread_desc),
-                                            KPerThread,
-                                            HoPerThread,
-                                            WoPerThread,
-                                            EPerThread,
-                                            ABlockTransferSrcScalarPerVector,
-                                            ABlockTransferDstScalarPerVector_K>{};
-        auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-        const auto k_thread_id  = c_thread_mtx_index.k;
-        const auto ho_thread_id = c_thread_mtx_index.h;
-        const auto wo_thread_id = c_thread_mtx_index.w;
-        const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
-        const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
-        const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
-        const index_t ho_thread_data_on_global =
-            ho_block_data_on_global + ho_thread_id * HoPerThread;
-        const index_t wo_thread_data_on_global =
-            wo_block_data_on_global + wo_thread_id * WoPerThread;
-        // A matrix blockwise copy
-        auto a_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
-                                                   InMemoryDataOperation::Set,
-                                                   Sequence<E, KPerBlock>,
-                                                   ABlockTransferThreadSliceLengths_E_K,
-                                                   ABlockTransferThreadClusterLengths_E_K,
-                                                   ABlockTransferThreadClusterArrangeOrder,
-                                                   FloatAB,
-                                                   FloatAB,
-                                                   decltype(a_e_k_global_desc),
-                                                   decltype(a_e_k_desc),
-                                                   ABlockTransferSrcAccessOrder,
-                                                   Sequence<0, 1>,
-                                                   ABlockTransferSrcVectorDim,
-                                                   1,
-                                                   ABlockTransferSrcScalarPerVector,
-                                                   ABlockTransferDstScalarPerVector_K,
-                                                   AddressSpace::Global,
-                                                   AddressSpace::Lds,
-                                                   1,
-                                                   1,
-                                                   AThreadTransferSrcResetCoordinateAfterRun,
-                                                   true>(
-                a_e_k_global_desc,
-                make_multi_index(0, k_block_data_on_global),
-                a_e_k_desc,
-                make_multi_index(0, 0));
-        constexpr auto b_e_n_ho_wo_thread_desc =
-            make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-                Number<EPerBlock>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
-        auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
-            FloatAB,
-            FloatAB,
-            decltype(b_e_n_ho_wo_global_desc),
-            decltype(b_e_n_ho_wo_thread_desc),
-            Sequence<EPerBlock, 1, HoPerThread, WoPerThread>,
-            BBlockTransferSrcAccessOrder,
-            BBlockTransferSrcVectorDim,
-            BBlockTransferSrcScalarPerVector,
-            AddressSpace::Global,
-            AddressSpace::Vgpr,
-            InMemoryDataOperation::Set,
-            1,
-            true>(b_e_n_ho_wo_global_desc,
-                  make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global));
-        FloatAB* p_a_block = p_shared_block;
-        // register allocation for output
-        FloatAcc p_c_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
-        // zero out threadwise output
-        threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_c_thread);
-        constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0);
-        // hack to control index calculation when iterating over A and B matrix for threadwise copy
-        constexpr auto a_e_k_global_iterator_hacks       = AGlobalIteratorHacks{};
-        constexpr auto b_e_n_ho_wo_global_iterator_hacks = BGlobalIteratorHacks{};
-        // hack to control index calculation when move slice window for A and B matrix for
-        // threadwise copy
-        constexpr auto a_e_k_global_move_slice_window_iterator_hack =
-            AGlobalMoveSliceWindowIteratorHacks{};
-        constexpr auto b_e_n_ho_wo_global_move_slice_window_iterator_hack =
-            BGlobalMoveSliceWindowIteratorHacks{};
-        constexpr auto b_thread_space_size = b_e_n_ho_wo_thread_desc.GetElementSpaceSize();
-        FloatAB p_b_thread[b_thread_space_size * 2];
-        FloatAB* p_b_thread_double = p_b_thread;
-        // LDS double buffer: preload data into LDS
-        {
-            a_blockwise_copy.RunRead(a_e_k_global_desc, p_a_global, a_e_k_global_iterator_hacks);
-            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                      p_b_global,
-                                      b_e_n_ho_wo_thread_desc,
-                                      make_tuple(I0, I0, I0, I0),
-                                      p_b_thread_double,
-                                      b_e_n_ho_wo_global_iterator_hacks);
-            a_blockwise_copy.RunWrite(a_e_k_desc, p_a_block);
-        }
-        __syncthreads();
-        index_t b_block_data_begin = 0;
-        if constexpr(HasMainKBlockLoop)
-        {
-            FloatAB* p_b_thread_even = p_b_thread_double;
-            FloatAB* p_b_thread_odd  = p_b_thread_double + b_thread_space_size;
-            // LDS double buffer: main body
-            // use Do-While loop instead of For loop to simplify control flow
-            do
-            {
-                // even iteration
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                         b_thread_slice_copy_step);
-                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                          p_b_global,
-                                          b_e_n_ho_wo_thread_desc,
-                                          make_tuple(I0, I0, I0, I0),
-                                          p_b_thread_odd,
-                                          b_e_n_ho_wo_global_iterator_hacks);
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(
-                    p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
-                    p_b_thread_even,
-                    p_c_thread);
-                b_block_data_begin += EPerBlock;
-                b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                         b_thread_slice_copy_step);
-                b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                          p_b_global,
-                                          b_e_n_ho_wo_thread_desc,
-                                          make_tuple(I0, I0, I0, I0),
-                                          p_b_thread_even,
-                                          b_e_n_ho_wo_global_iterator_hacks);
-                // LDS double buffer: GEMM on current data
-                blockwise_gemm.Run(
-                    p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
-                    p_b_thread_odd,
-                    p_c_thread);
-                b_block_data_begin += EPerBlock;
-            } while(b_block_data_begin < E - 2 * EPerBlock);
-        }
-        // LDS double buffer: tail
-        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
-        {
-            b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc,
-                                                     b_thread_slice_copy_step);
-            b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc,
-                                      p_b_global,
-                                      b_e_n_ho_wo_thread_desc,
-                                      make_tuple(I0, I0, I0, I0),
-                                      p_b_thread_double + b_thread_space_size,
-                                      b_e_n_ho_wo_global_iterator_hacks);
-            // LDS double buffer: GEMM on 2nd-last data
-            blockwise_gemm.Run(
-                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
-                p_b_thread_double,
-                p_c_thread);
-            b_block_data_begin += EPerBlock;
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
-                p_b_thread_double + b_thread_space_size,
-                p_c_thread);
-        }
-        else // if has 1 iteration left
-        {
-            // LDS double buffer: GEMM on last data
-            blockwise_gemm.Run(
-                p_a_block + a_e_k_block_desc.CalculateOffset(make_tuple(b_block_data_begin, 0)),
-                p_b_thread_double,
-                p_c_thread);
-        }
-        // output: register to global memory
-        {
-            // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
-            constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
-            const index_t k_thread_data_on_global =
-                k_block_data_on_global + k_thread_id * KPerThread;
-            ThreadwiseDynamicTensorSliceTransfer_v1r3<
-                FloatAcc,
-                FloatC,
-                decltype(c_k_n_ho_wo_thread_desc),
-                decltype(c_k_n_ho_wo_global_desc),
-                Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
-                CThreadTransferSrcDstAccessOrder,
-                CThreadTransferSrcDstVectorDim,
-                CThreadTransferDstScalarPerVector,
-                AddressSpace::Vgpr,
-                AddressSpace::Global,
-                CGlobalMemoryDataOperation,
-                1,
-                true>(
-                c_k_n_ho_wo_global_desc,
-                make_multi_index(
-                    k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
-                .Run(c_k_n_ho_wo_thread_desc,
-                     make_tuple(I0, I0, I0, I0),
-                     p_c_thread,
-                     c_k_n_ho_wo_global_desc,
-                     p_c_global,
-                     c_k_n_ho_wo_global_tensor_iterator_hacks);
-        }
-    }
-    // pass tensor descriptor by reference
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc& a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc& b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc& c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
-        __shared__ FloatAB p_shared_block[shared_block_size];
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            p_shared_block,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-    // pass tensor descriptors by their pointers
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        const auto a_e_k_global_desc       = *p_a_e_k_global_desc;
-        const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
-        const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-    // pass tensor descriptors by void*
-    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
-    __device__ void Run(const void* p_a_e_k_global_desc,
-                        const FloatAB* __restrict__ p_a_global,
-                        const void* p_b_e_n_ho_wo_global_desc,
-                        const FloatAB* __restrict__ p_b_global,
-                        const void* p_c_k_n_ho_wo_global_desc,
-                        FloatC* __restrict__ p_c_global,
-                        integral_constant<bool, HasMainKBlockLoop>,
-                        integral_constant<bool, HasDoubleTailKBlockLoop>) const
-    {
-        const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
-        const auto b_e_n_ho_wo_global_desc =
-            *reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
-        const auto c_k_n_ho_wo_global_desc =
-            *reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
-        Run(a_e_k_global_desc,
-            p_a_global,
-            b_e_n_ho_wo_global_desc,
-            p_b_global,
-            c_k_n_ho_wo_global_desc,
-            p_c_global,
-            integral_constant<bool, HasMainKBlockLoop>{},
-            integral_constant<bool, HasDoubleTailKBlockLoop>{});
-    }
-};
-#else
 template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
@@ -682,6 +229,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
        // register allocation for output
        StaticBuffer<FloatAcc, c_k_n_ho_wo_thread_desc.GetElementSpaceSize()> c_thread_buf;
+        // initialize output thread tensor
        ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
                                           decltype(c_k_n_ho_wo_thread_desc),
                                           Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
@@ -704,7 +252,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
        StaticBuffer<FloatAB, b_e_n_ho_wo_thread_desc.GetElementSpaceSize()> b_thread_even_buf,
            b_thread_odd_buf;
-        // LDS double buffer: preload data into LDS
+        // LDS double buffer: preload data
        {
            a_blockwise_copy.RunRead(a_e_k_global_desc, p_a_global, a_e_k_global_iterator_hacks);
@@ -923,7 +471,5 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
    }
 };
-#endif
 } // namespace ck
 #endif
--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
@@ -6,170 +6,6 @@
 namespace ck {
-#if 0
-template <typename Float, typename Desc>
-__device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread)
-{
-    static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto desc = Desc{};
-    constexpr auto M = desc.GetLength(I0);
-    constexpr auto N = desc.GetLength(I1);
-    static_for<0, M, 1>{}([&](auto i) {
-        static_for<0, N, 1>{}([&](auto j) {
-            constexpr auto offset = desc.CalculateOffset(make_tuple(i, j));
-            p_thread[offset] = Float(0);
-        });
-    });
-}
-template <typename SrcDesc,
-          typename DstDesc,
-          index_t NSliceRow,
-          index_t NSliceCol,
-          index_t DataPerAccess>
-struct ThreadwiseMatrixSliceCopy_v2
-{
-    template <typename Data>
-    __device__ static void Run(const Data* p_src, Data* p_dst)
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        using vector_t = typename vector_type_maker<Data, DataPerAccess>::type::type;
-        static_for<0, NSliceRow, 1>{}([&](auto i) {
-            static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
-                constexpr auto src_offset = SrcDesc{}.CalculateOffset(make_tuple(i, j));
-                constexpr auto dst_offset = DstDesc{}.CalculateOffset(make_tuple(i, j));
-                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-            });
-        });
-    }
-};
-// C[M, N] += transpose(A[K, M]) * B[K, N]
-//   Element of matrix can be vectorized data
-template <typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename ADesc,
-          typename BDesc,
-          typename CDesc,
-          typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                                      CDesc::IsKnownAtCompileTime(),
-                                  bool>::type = false>
-struct ThreadwiseGemm_km_kn_mn_v1
-{
-    __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto M = CDesc{}.GetLength(I0);
-        constexpr auto N = CDesc{}.GetLength(I1);
-        constexpr auto K = ADesc{}.GetLength(I0);
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, M, 1>{}([&](auto m) {
-                static_for<0, N, 1>{}([&](auto n) {
-                    constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
-                    constexpr auto b_offset = BDesc{}.CalculateOffset(make_tuple(k, n));
-                    constexpr auto c_offset = CDesc{}.CalculateOffset(make_tuple(m, n));
-                    p_c[c_offset] +=
-                        inner_product_with_conversion<FloatC>{}(p_a[a_offset], p_b[b_offset]);
-                });
-            });
-        });
-    }
-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-    __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto M = CDesc{}.GetLength(I0);
-        constexpr auto N = CDesc{}.GetLength(I1);
-        constexpr auto K = ADesc{}.GetLength(I0);
-        static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, M, 1>{}([&](auto m) {
-                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
-                if constexpr(N == 2)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-                    amd_assembly_outer_product_1x2(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1]);
-                }
-                else if constexpr(N == 4)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
-                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
-                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
-                    amd_assembly_outer_product_1x4(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_b[b_offset_2],
-                                                   p_b[b_offset_3],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1],
-                                                   p_c[c_offset_2],
-                                                   p_c[c_offset_3]);
-                }
-            });
-        });
-    }
-#endif
-    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-        Run_amd_asm(p_a, p_b, p_c);
-#else
-        Run_source(p_a, p_b, p_c);
-#endif
-    }
-};
-#endif
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 //   Element of matrix can be vectorized data
 // Assume:

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -6,139 +6,6 @@
 namespace ck {
-#if 0
-template <typename Float, typename Desc>
-__device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
-{
-    static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto desc = Desc{};
-    constexpr auto K = desc.GetLength(I0);
-    constexpr auto H = desc.GetLength(I2);
-    constexpr auto W = desc.GetLength(I3);
-    static_for<0, K, 1>{}([&](auto i) {
-        static_for<0, H, 1>{}([&](auto j) {
-            static_for<0, W, 1>{}([&](auto k) {
-                constexpr auto offset = desc.CalculateOffset(make_tuple(i, 0, j, k));
-                p_thread[offset] = Float(0);
-            });
-        });
-    });
-}
-#endif
-#if 0
-// C[M, N] += transpose(A[K, M]) * B[K, N]
-//   Element of matrix can be vectorized data
-template <typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename ADesc,
-          typename BDesc,
-          typename CDesc,
-          index_t H,
-          index_t W,
-          typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                                      CDesc::IsKnownAtCompileTime(),
-                                  bool>::type = false>
-struct ThreadwiseGemm_km_kn_mn_v3
-{
-    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-        constexpr auto E = ADesc{}.GetLength(I0);
-        constexpr auto K = ADesc{}.GetLength(I1);
-        static_for<0, E, 1>{}([&](auto e) {
-            static_for<0, K, 1>{}([&](auto k) {
-                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
-                if constexpr(H == 2 && W == 2)
-                {
-                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
-                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
-                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
-                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
-                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
-                                                   p_b[Number<b_offset_0>{}],
-                                                   p_b[Number<b_offset_1>{}],
-                                                   p_b[Number<b_offset_2>{}],
-                                                   p_b[Number<b_offset_3>{}],
-                                                   p_c[Number<c_offset_0>{}],
-                                                   p_c[Number<c_offset_1>{}],
-                                                   p_c[Number<c_offset_2>{}],
-                                                   p_c[Number<c_offset_3>{}]);
-                }
-                else if constexpr(H == 4 && W == 1)
-                {
-                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
-                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
-                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
-                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
-                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
-                                                   p_b[Number<b_offset_0>{}],
-                                                   p_b[Number<b_offset_1>{}],
-                                                   p_b[Number<b_offset_2>{}],
-                                                   p_b[Number<b_offset_3>{}],
-                                                   p_c[Number<c_offset_0>{}],
-                                                   p_c[Number<c_offset_1>{}],
-                                                   p_c[Number<c_offset_2>{}],
-                                                   p_c[Number<c_offset_3>{}]);
-                }
-                else
-                {
-                    static_for<0, H, 1>{}([&](auto h) {
-                        static_for<0, W, 1>{}([&](auto w) {
-                            constexpr index_t b_offset =
-                                BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
-                            constexpr index_t c_offset =
-                                CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
-#if 0
-                            p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
-                                                                                               p_b[Number<b_offset>{}]);
-#else
-                            amd_assembly_inner_product(p_a[Number<a_offset>{}],
-                                                       p_b[Number<b_offset>{}],
-                                                       p_c[Number<c_offset>{}]);
-#endif
-                        });
-                    });
-                }
-            });
-        });
-    }
-};
-#else
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 //   Element of matrix can be vectorized data
 // Assume:
@@ -277,8 +144,8 @@ struct ThreadwiseGemm_km_kn_mn_v3
                                CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));
 #if 0
-                            c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(a_buf[Number<a_offset>{}],
-                                                                                               b_buf[Number<b_offset>{}]);
+                            c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
+                                a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
 #else
                            amd_assembly_inner_product(a_buf[Number<a_offset>{}],
                                                       b_buf[Number<b_offset>{}],
@@ -291,7 +158,6 @@ struct ThreadwiseGemm_km_kn_mn_v3
        });
    }
 };
-#endif
 } // namespace ck
 #endif