clean up

08bb4372 · Chao Liu · 905f5a3f · 08bb4372 · 08bb4372 · 08bb4372
Commit 08bb4372 authored Apr 23, 2021 by Chao Liu
5 changed files
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v2.hpp
@@ -7,366 +7,6 @@

 namespace ck {

-#if 0
-// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
-// A and B are visable to the whole block, C is distributed among each thread
-// If following number are power of 2, index calculation shall be greatly reduced:
-//    MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
-//    MLevel1ThreadCluster, NLevel1ThreadCluster
-template <index_t BlockSize,
-          typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename BlockMatrixA,
-          typename BlockMatrixB,
-          typename ThreadMatrixC,
-          index_t MPerThreadSubC,
-          index_t NPerThreadSubC,
-          index_t KPerThreadLoop,
-          index_t MLevel0ThreadCluster,
-          index_t NLevel0ThreadCluster,
-          index_t MLevel1ThreadCluster,
-          index_t NLevel1ThreadCluster,
-          index_t ThreadGemmADataPerRead_M,
-          index_t ThreadGemmBDataPerRead_N>
-struct BlockwiseGemm_km_kn_m0m1n0n1_v1
-{
-    struct MatrixIndex
-    {
-        index_t row;
-        index_t col;
-    };
-
-    index_t mMyThreadOffsetA;
-    index_t mMyThreadOffsetB;
-
-    __device__ BlockwiseGemm_km_kn_m0m1n0n1_v1()
-    {
-        static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
-                          BlockMatrixB::IsKnownAtCompileTime() &&
-                          ThreadMatrixC::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        constexpr index_t ThreadPerLevel1Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster *
-                                                   MLevel1ThreadCluster * NLevel1ThreadCluster;
-
-        static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
-
-        static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
-                      "wrong! K dimension not consistent\n");
-
-        constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
-
-        static_assert(M % (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster) == 0 &&
-                          N % (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster) == 0,
-                      "wrong! Cannot evenly divide work among\n");
-
-        static_assert(ThreadMatrixC{}.GetLength(I0) == GetThreadMatrixCLengths()[I0] &&
-                          ThreadMatrixC{}.GetLength(I1) == GetThreadMatrixCLengths()[I1],
-                      "wrong! ThreadMatrixC lengths is wrong");
-
-        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-        mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.row));
-        mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.col));
-    }
-
-    __device__ static constexpr auto GetThreadMatrixCLengths()
-    {
-        constexpr auto I1 = Number<1>{};
-
-        constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
-
-        constexpr index_t MRepeat =
-            M / (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster);
-        constexpr index_t NRepeat =
-            N / (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster);
-
-        return Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>{};
-    }
-
-    __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
-    {
-        constexpr index_t ThreadPerLevel0Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster;
-
-        index_t level1_id   = thread_id / ThreadPerLevel0Cluster;
-        index_t level1_m_id = level1_id / NLevel1ThreadCluster;
-        index_t level1_n_id = level1_id % NLevel1ThreadCluster;
-
-        index_t level0_id   = thread_id % ThreadPerLevel0Cluster;
-        index_t level0_m_id = level0_id / NLevel0ThreadCluster;
-        index_t level0_n_id = level0_id % NLevel0ThreadCluster;
-
-        constexpr index_t MPerLevel0Cluster = MPerThreadSubC * MLevel0ThreadCluster;
-        constexpr index_t NPerLevel0Cluster = NPerThreadSubC * NLevel0ThreadCluster;
-
-        return MatrixIndex{level1_m_id * MPerLevel0Cluster + level0_m_id * MPerThreadSubC,
-                           level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
-    }
-
-    __device__ void
-    Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        constexpr auto a_block_mtx  = BlockMatrixA{};
-        constexpr auto b_block_mtx  = BlockMatrixB{};
-        constexpr auto c_thread_mtx = ThreadMatrixC{};
-
-        constexpr auto K = a_block_mtx.GetLength(I0);
-
-        constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
-        constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
-
-        constexpr index_t MPerLevel1Cluster =
-            MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
-        constexpr index_t NPerLevel1Cluster =
-            NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
-        // thread A, B for GEMM
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
-
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
-
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-        FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
-
-        constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
-                                                                    decltype(a_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    MPerThreadSubC,
-                                                                    ThreadGemmADataPerRead_M>{};
-
-        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
-                                                                    decltype(b_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    NPerThreadSubC,
-                                                                    ThreadGemmBDataPerRead_N>{};
-
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<FloatA,
-                                                                    FloatB,
-                                                                    FloatC,
-                                                                    decltype(a_thread_mtx),
-                                                                    decltype(b_thread_mtx),
-                                                                    decltype(c_thread_mtx)>{};
-        // loop over k
-        static_for<0, K, KPerThreadLoop>{}([&](auto k_begin) {
-            // read A
-            static_for<0, MRepeat, 1>{}([&](auto m_repeat) {
-                a_thread_copy.Run(p_a_block +
-                                      a_block_mtx.CalculateOffset(
-                                          make_tuple(k_begin, m_repeat * MPerLevel1Cluster)) +
-                                      mMyThreadOffsetA,
-                                  p_a_thread + a_thread_mtx.CalculateOffset(
-                                                   make_tuple(0, m_repeat * MPerThreadSubC)));
-            });
-
-            // read B
-            static_for<0, NRepeat, 1>{}([&](auto n_repeat) {
-                b_thread_copy.Run(p_b_block +
-                                      b_block_mtx.CalculateOffset(
-                                          make_tuple(k_begin, n_repeat * NPerLevel1Cluster)) +
-                                      mMyThreadOffsetB,
-                                  p_b_thread + b_thread_mtx.CalculateOffset(
-                                                   make_tuple(0, n_repeat * NPerThreadSubC)));
-            });
-
-            // C += A * B
-            threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-        });
-    }
-
-    __device__ void
-    Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
-    {
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        constexpr auto a_block_mtx  = BlockMatrixA{};
-        constexpr auto b_block_mtx  = BlockMatrixB{};
-        constexpr auto c_thread_mtx = ThreadMatrixC{};
-
-        constexpr auto K = a_block_mtx.GetLength(I0);
-
-        constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
-        constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
-
-        constexpr index_t MPerLevel1Cluster =
-            MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
-
-        constexpr index_t NPerLevel1Cluster =
-            NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
-        static_assert(MRepeat == 2 && NRepeat == 2,
-                      "wrong! inline asm cannot deal with this GEMM config yet");
-
-        // thread A, B
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
-
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
-
-        // thread A-sub, B-sub
-        constexpr auto a_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}),
-            make_tuple(Number<MPerThread>{}, Number<1>{}));
-
-        constexpr auto b_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}),
-            make_tuple(Number<NPerThread>{}, Number<1>{}));
-
-        constexpr auto c_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
-            make_tuple(Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}),
-            make_tuple(Number<NPerThread>{}, Number<1>{}));
-
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-        FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
-
-        constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
-                                                                    decltype(a_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    MPerThreadSubC,
-                                                                    ThreadGemmADataPerRead_M>{};
-
-        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
-                                                                    decltype(b_thread_mtx),
-                                                                    KPerThreadLoop,
-                                                                    NPerThreadSubC,
-                                                                    ThreadGemmBDataPerRead_N>{};
-
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<FloatA,
-                                                                    FloatB,
-                                                                    FloatC,
-                                                                    decltype(a_thread_sub_mtx),
-                                                                    decltype(b_thread_sub_mtx),
-                                                                    decltype(c_thread_sub_mtx)>{};
-
-        const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA;
-        const FloatB* p_b_block_off = p_b_block + mMyThreadOffsetB;
-
-        // read A_sub_0
-        a_thread_copy.Run(p_a_block_off, p_a_thread);
-
-        // read B_sub_0
-        b_thread_copy.Run(p_b_block_off, p_b_thread);
-
-        // read B_sub_1
-        b_thread_copy.Run(p_b_block_off +
-                              b_block_mtx.CalculateOffset(make_tuple(0, NPerLevel1Cluster)),
-                          p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-
-        // read A_sub_1
-        a_thread_copy.Run(p_a_block_off +
-                              a_block_mtx.CalculateOffset(make_tuple(0, MPerLevel1Cluster)),
-                          p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
-
-        // C_sub_00 += transpose(A_sub_0) * B_sub_0
-        threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-
-        // C_sub_01 += transpose(A_sub_0) * B_sub_1
-        threadwise_gemm.Run(
-            p_a_thread,
-            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-
-        // loop over rest of k
-        static_for<KPerThreadLoop, K, KPerThreadLoop>{}([&](auto k) {
-            // read A_sub_0
-            a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, 0)),
-                              p_a_thread);
-
-            // C_sub_10 += transpose(A_sub_1) * B_sub_0
-            threadwise_gemm.Run(
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-                p_b_thread,
-                p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
-
-            // read B_sub_0
-            b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, 0)),
-                              p_b_thread);
-
-            // C_sub_11 += transpose(A_sub_1) * B_sub_1
-            threadwise_gemm.Run(
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-                p_c_thread +
-                    c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
-
-            // read B_sub_1
-            b_thread_copy.Run(
-                p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, NPerLevel1Cluster)),
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-
-            // read A_sub_1
-            a_thread_copy.Run(
-                p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, MPerLevel1Cluster)),
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
-
-            // C_sub_00 += transpose(A_sub_0) * B_sub_0
-            threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
-
-            // C_sub_01 += transpose(A_sub_0) * B_sub_1
-            threadwise_gemm.Run(
-                p_a_thread,
-                p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-                p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
-        });
-
-        // C_sub_10 += transpose(A_sub_1) * B_sub_0
-        threadwise_gemm.Run(
-            p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-            p_b_thread,
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
-
-        // C_sub_11 += transpose(A_sub_1) * B_sub_1
-        threadwise_gemm.Run(
-            p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
-            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
-            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
-    }
-
-    __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
-    {
-#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        constexpr index_t MPerThread = ThreadMatrixC{}.GetLength(I0);
-        constexpr index_t NPerThread = ThreadMatrixC{}.GetLength(I1);
-
-        constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
-        constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
-
-        if constexpr(MRepeat == 2 && NRepeat == 2)
-        {
-            Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
-        }
-        else
-        {
-            Run_naive(p_a_block, p_b_block, p_c_thread);
-        }
-#else
-        Run_naive(p_a_block, p_b_block, p_c_thread);
-#endif
-    }
-};
-#endif
-
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 // A and B are visable to the whole block, C is distributed among each thread
 // Assume:

--- a/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -6,201 +6,6 @@

 namespace ck {

-#if 0
-// blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
-// A and B are visable to the whole block, C is distributed among each thread
-// If following number are power of 2, index calculation shall be greatly reduced:
-//    KPerThread, HPerThread, MLevel0ThreadCluster, NLevel0ThreadCluster,
-//    MLevel1ThreadCluster, NLevel1ThreadCluster
-template <index_t BlockSize,
-          typename BlockMatrixA,
-          typename BlockMatrixB,
-          typename ThreadMatrixC,
-          index_t KPerThread,
-          index_t HPerThread,
-          index_t WPerThread,
-          index_t EPerThreadLoop,
-          index_t ThreadGemmADataPerRead_K,
-          index_t ThreadGemmBDataPerRead_W>
-struct BlockwiseGemm_km_kn_m0m1n0n1_v3
-{
-    struct MatrixIndex
-    {
-        index_t k;
-        index_t h;
-        index_t w;
-    };
-
-    index_t mMyThreadOffsetA;
-
-    __device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
-    {
-        static_assert(BlockMatrixA::IsKnownAtCompileTime() &&
-                          BlockMatrixB::IsKnownAtCompileTime() &&
-                          ThreadMatrixC::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
-                      "wrong! K dimension not consistent\n");
-
-        constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed
-        constexpr index_t N = BlockMatrixB{}.GetLength(I1);
-        constexpr index_t H = BlockMatrixB{}.GetLength(I2);
-        constexpr index_t W = BlockMatrixB{}.GetLength(I3);
-
-        static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
-                      "wrong! Cannot evenly divide work among\n");
-
-        constexpr auto KThreadCluster = K / KPerThread;
-        constexpr auto HThreadCluster = H / HPerThread;
-        constexpr auto WThreadCluster = W / WPerThread;
-
-        static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
-                      "wrong! wrong blocksize\n");
-
-        auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
-
-        mMyThreadOffsetA =
-            BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.k * KPerThread));
-    }
-
-    __device__ static constexpr auto GetThreadMatrixCLengths()
-    {
-        return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
-    }
-
-    __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
-    {
-        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
-        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
-
-        constexpr auto num_w_threads  = W / WPerThread;
-        constexpr auto num_h_threads  = H / HPerThread;
-        constexpr auto num_hw_threads = num_w_threads * num_h_threads;
-
-        index_t k_thread_id  = thread_id / num_hw_threads;
-        index_t hw_thread_id = thread_id % num_hw_threads;
-
-        index_t h_thread_id = hw_thread_id / num_w_threads;
-        index_t w_thread_id = hw_thread_id % num_w_threads;
-
-        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
-    }
-
-    template <typename SrcDesc,
-              typename DstDesc,
-              index_t NSliceRow,
-              index_t NSliceCol,
-              index_t DataPerAccess>
-    struct ThreadwiseSliceCopy_a
-    {
-        template <typename Data>
-        __device__ static void Run(const Data* p_src, Data* p_dst)
-        {
-            static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                          "wrong! Desc should be known at compile-time");
-
-            using vector_t = typename vector_type_maker<Data, DataPerAccess>::type::type;
-
-            static_for<0, NSliceRow, 1>{}([&](auto i) {
-                static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
-                    constexpr auto src_offset = SrcDesc{}.CalculateOffset(make_tuple(i, j));
-                    constexpr auto dst_offset = DstDesc{}.CalculateOffset(make_tuple(i, j));
-
-                    *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-                });
-            });
-        }
-    };
-
-    template <typename FloatA, typename FloatB, typename FloatC>
-    __device__ void
-    Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
-    {
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        constexpr auto a_block_mtx = BlockMatrixA{};
-
-        constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
-
-        constexpr auto KPerThreadSubC = 4;
-
-        constexpr auto HoPerThreadSubC = 2;
-        constexpr auto WoPerThreadSubC = 2;
-
-        static_assert(KPerThread % KPerThreadSubC == 0, "");
-        static_assert(HPerThread % HoPerThreadSubC == 0, "");
-        static_assert(WPerThread % WoPerThreadSubC == 0, "");
-
-        // thread A, B for GEMM
-        constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{}));
-
-        constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-            Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
-
-        constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
-            Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
-
-        FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
-
-        constexpr auto a_thread_copy = ThreadwiseSliceCopy_a<BlockMatrixA,
-                                                             decltype(a_thread_mtx),
-                                                             EPerThreadLoop,
-                                                             KPerThreadSubC,
-                                                             ThreadGemmADataPerRead_K>{};
-
-        constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
-                                                                    decltype(b_thread_mtx),
-                                                                    decltype(c_thread_mtx),
-                                                                    HoPerThreadSubC,
-                                                                    WoPerThreadSubC>{};
-        // loop over k
-#pragma unroll
-        for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
-        {
-#pragma unroll
-            for(index_t k_begin = 0; k_begin < KPerThread; k_begin += KPerThreadSubC)
-            {
-                a_thread_copy.Run(p_a_block +
-                                      a_block_mtx.CalculateOffset(make_tuple(e_begin, k_begin)) +
-                                      mMyThreadOffsetA,
-                                  p_a_thread);
-
-#pragma unroll
-                for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
-                {
-#pragma unroll
-                    for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
-                    {
-                        threadwise_gemm.Run(p_a_thread,
-                                            p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
-                                                             e_begin, 0, h_begin, w_begin)),
-                                            p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(
-                                                             k_begin, 0, h_begin, w_begin)));
-                    }
-                }
-            }
-        }
-    }
-
-    template <typename FloatA, typename FloatB, typename FloatC>
-    __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
-    {
-        Run_naive(p_a_block, p_b_thread, p_c_thread);
-    }
-};
-#else
 // blockwise GEMM: C[M, N] += transpose(A[K, M]) * B[K, N]
 // A and B are visable to the whole block, C is distributed among each thread
 // If following number are power of 2, index calculation shall be greatly reduced:
@@ -379,7 +184,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
        });
    }
 };
-#endif

 } // namespace ck
 #endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v2.hpp
@@ -6,170 +6,6 @@

 namespace ck {

-#if 0
-template <typename Float, typename Desc>
-__device__ void threadwise_matrix_set_zero_v2(Desc, Float* __restrict__ p_thread)
-{
-    static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-
-    constexpr auto desc = Desc{};
-
-    constexpr auto M = desc.GetLength(I0);
-    constexpr auto N = desc.GetLength(I1);
-
-    static_for<0, M, 1>{}([&](auto i) {
-        static_for<0, N, 1>{}([&](auto j) {
-            constexpr auto offset = desc.CalculateOffset(make_tuple(i, j));
-
-            p_thread[offset] = Float(0);
-        });
-    });
-}
-
-template <typename SrcDesc,
-          typename DstDesc,
-          index_t NSliceRow,
-          index_t NSliceCol,
-          index_t DataPerAccess>
-struct ThreadwiseMatrixSliceCopy_v2
-{
-    template <typename Data>
-    __device__ static void Run(const Data* p_src, Data* p_dst)
-    {
-        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        using vector_t = typename vector_type_maker<Data, DataPerAccess>::type::type;
-
-        static_for<0, NSliceRow, 1>{}([&](auto i) {
-            static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
-                constexpr auto src_offset = SrcDesc{}.CalculateOffset(make_tuple(i, j));
-                constexpr auto dst_offset = DstDesc{}.CalculateOffset(make_tuple(i, j));
-
-                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-            });
-        });
-    }
-};
-
-// C[M, N] += transpose(A[K, M]) * B[K, N]
-//   Element of matrix can be vectorized data
-template <typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename ADesc,
-          typename BDesc,
-          typename CDesc,
-          typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                                      CDesc::IsKnownAtCompileTime(),
-                                  bool>::type = false>
-struct ThreadwiseGemm_km_kn_mn_v1
-{
-    __device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        constexpr auto M = CDesc{}.GetLength(I0);
-        constexpr auto N = CDesc{}.GetLength(I1);
-        constexpr auto K = ADesc{}.GetLength(I0);
-
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, M, 1>{}([&](auto m) {
-                static_for<0, N, 1>{}([&](auto n) {
-                    constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
-                    constexpr auto b_offset = BDesc{}.CalculateOffset(make_tuple(k, n));
-                    constexpr auto c_offset = CDesc{}.CalculateOffset(make_tuple(m, n));
-
-                    p_c[c_offset] +=
-                        inner_product_with_conversion<FloatC>{}(p_a[a_offset], p_b[b_offset]);
-                });
-            });
-        });
-    }
-
-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-    __device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        constexpr auto M = CDesc{}.GetLength(I0);
-        constexpr auto N = CDesc{}.GetLength(I1);
-        constexpr auto K = ADesc{}.GetLength(I0);
-
-        static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
-
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, M, 1>{}([&](auto m) {
-                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(k, m));
-
-                if constexpr(N == 2)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-
-                    amd_assembly_outer_product_1x2(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1]);
-                }
-                else if constexpr(N == 4)
-                {
-                    constexpr auto b_offset_0 = BDesc{}.CalculateOffset(make_tuple(k, I0));
-                    constexpr auto b_offset_1 = BDesc{}.CalculateOffset(make_tuple(k, I1));
-                    constexpr auto b_offset_2 = BDesc{}.CalculateOffset(make_tuple(k, I2));
-                    constexpr auto b_offset_3 = BDesc{}.CalculateOffset(make_tuple(k, I3));
-
-                    constexpr auto c_offset_0 = CDesc{}.CalculateOffset(make_tuple(m, I0));
-                    constexpr auto c_offset_1 = CDesc{}.CalculateOffset(make_tuple(m, I1));
-                    constexpr auto c_offset_2 = CDesc{}.CalculateOffset(make_tuple(m, I2));
-                    constexpr auto c_offset_3 = CDesc{}.CalculateOffset(make_tuple(m, I3));
-
-                    amd_assembly_outer_product_1x4(p_a[a_offset],
-                                                   p_b[b_offset_0],
-                                                   p_b[b_offset_1],
-                                                   p_b[b_offset_2],
-                                                   p_b[b_offset_3],
-                                                   p_c[c_offset_0],
-                                                   p_c[c_offset_1],
-                                                   p_c[c_offset_2],
-                                                   p_c[c_offset_3]);
-                }
-            });
-        });
-    }
-#endif
-
-    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
-        Run_amd_asm(p_a, p_b, p_c);
-#else
-        Run_source(p_a, p_b, p_c);
-#endif
-    }
-};
-#endif
-
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 //   Element of matrix can be vectorized data
 // Assume:

--- a/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -6,139 +6,6 @@

 namespace ck {

-#if 0
-template <typename Float, typename Desc>
-__device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread)
-{
-    static_assert(Desc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time");
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto desc = Desc{};
-
-    constexpr auto K = desc.GetLength(I0);
-    constexpr auto H = desc.GetLength(I2);
-    constexpr auto W = desc.GetLength(I3);
-
-    static_for<0, K, 1>{}([&](auto i) {
-        static_for<0, H, 1>{}([&](auto j) {
-            static_for<0, W, 1>{}([&](auto k) {
-                constexpr auto offset = desc.CalculateOffset(make_tuple(i, 0, j, k));
-
-                p_thread[offset] = Float(0);
-            });
-        });
-    });
-}
-#endif
-
-#if 0
-// C[M, N] += transpose(A[K, M]) * B[K, N]
-//   Element of matrix can be vectorized data
-template <typename FloatA,
-          typename FloatB,
-          typename FloatC,
-          typename ADesc,
-          typename BDesc,
-          typename CDesc,
-          index_t H,
-          index_t W,
-          typename std::enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                                      CDesc::IsKnownAtCompileTime(),
-                                  bool>::type = false>
-struct ThreadwiseGemm_km_kn_mn_v3
-{
-    __device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
-    {
-        static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
-                          CDesc::IsKnownAtCompileTime(),
-                      "wrong! Desc should be known at compile-time");
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        constexpr auto E = ADesc{}.GetLength(I0);
-        constexpr auto K = ADesc{}.GetLength(I1);
-
-        static_for<0, E, 1>{}([&](auto e) {
-            static_for<0, K, 1>{}([&](auto k) {
-                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
-
-                if constexpr(H == 2 && W == 2)
-                {
-
-                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 1));
-                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 1));
-
-                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 1));
-                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 1));
-
-                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
-                                                   p_b[Number<b_offset_0>{}],
-                                                   p_b[Number<b_offset_1>{}],
-                                                   p_b[Number<b_offset_2>{}],
-                                                   p_b[Number<b_offset_3>{}],
-                                                   p_c[Number<c_offset_0>{}],
-                                                   p_c[Number<c_offset_1>{}],
-                                                   p_c[Number<c_offset_2>{}],
-                                                   p_c[Number<c_offset_3>{}]);
-                }
-                else if constexpr(H == 4 && W == 1)
-                {
-
-                    constexpr index_t b_offset_0 = BDesc{}.CalculateOffset(make_tuple(e, 0, 0, 0));
-                    constexpr index_t b_offset_1 = BDesc{}.CalculateOffset(make_tuple(e, 0, 1, 0));
-                    constexpr index_t b_offset_2 = BDesc{}.CalculateOffset(make_tuple(e, 0, 2, 0));
-                    constexpr index_t b_offset_3 = BDesc{}.CalculateOffset(make_tuple(e, 0, 3, 0));
-
-                    constexpr index_t c_offset_0 = CDesc{}.CalculateOffset(make_tuple(k, 0, 0, 0));
-                    constexpr index_t c_offset_1 = CDesc{}.CalculateOffset(make_tuple(k, 0, 1, 0));
-                    constexpr index_t c_offset_2 = CDesc{}.CalculateOffset(make_tuple(k, 0, 2, 0));
-                    constexpr index_t c_offset_3 = CDesc{}.CalculateOffset(make_tuple(k, 0, 3, 0));
-
-                    amd_assembly_outer_product_1x4(p_a[Number<a_offset>{}],
-                                                   p_b[Number<b_offset_0>{}],
-                                                   p_b[Number<b_offset_1>{}],
-                                                   p_b[Number<b_offset_2>{}],
-                                                   p_b[Number<b_offset_3>{}],
-                                                   p_c[Number<c_offset_0>{}],
-                                                   p_c[Number<c_offset_1>{}],
-                                                   p_c[Number<c_offset_2>{}],
-                                                   p_c[Number<c_offset_3>{}]);
-                }
-                else
-                {
-                    static_for<0, H, 1>{}([&](auto h) {
-                        static_for<0, W, 1>{}([&](auto w) {
-                            constexpr index_t b_offset =
-                                BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
-                            constexpr index_t c_offset =
-                                CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
-
-#if 0
-                            p_c[Number<c_offset>{}] += inner_product_with_conversion<FloatC>{}(p_a[Number<a_offset>{}],
-                                                                                               p_b[Number<b_offset>{}]);
-#else
-                            amd_assembly_inner_product(p_a[Number<a_offset>{}],
-                                                       p_b[Number<b_offset>{}],
-                                                       p_c[Number<c_offset>{}]);
-#endif
-                        });
-                    });
-                }
-            });
-        });
-    }
-};
-#else
 // C[M, N] += transpose(A[K, M]) * B[K, N]
 //   Element of matrix can be vectorized data
 // Assume:
@@ -277,8 +144,8 @@ struct ThreadwiseGemm_km_kn_mn_v3
                                CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w));

 #if 0
-                            c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(a_buf[Number<a_offset>{}],
-                                                                                               b_buf[Number<b_offset>{}]);
+                            c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
+                                a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
 #else
                            amd_assembly_inner_product(a_buf[Number<a_offset>{}],
                                                       b_buf[Number<b_offset>{}],
@@ -291,7 +158,6 @@ struct ThreadwiseGemm_km_kn_mn_v3
        });
    }
 };
-#endif

 } // namespace ck
 #endif