Commit a037693f authored by ltqin

Merge branch 'develop' into conv_splitk_f32

parents 0694d6ed 4041850f
...@@ -200,3 +200,4 @@ enable_cppcheck(
 add_subdirectory(host)
 add_subdirectory(example)
 add_subdirectory(profiler)
+add_subdirectory(test)
...@@ -10,99 +10,99 @@ template <index_t BlockSize, ...@@ -10,99 +10,99 @@ template <index_t BlockSize,
typename FloatA, typename FloatA,
typename FloatB, typename FloatB,
typename FloatC, typename FloatC,
typename BlockMatrixA, typename ABlockDesc_E1_K1_E2,
typename BlockMatrixB, typename BBlockDesc_E1_N_Ho_Wo_E2,
typename ThreadMatrixC, typename CThreadDesc_K_N_Ho_Wo,
index_t KPerThread,
index_t HPerThread,
index_t WPerThread,
index_t EPerThreadLoop, index_t EPerThreadLoop,
index_t ThreadGemmADataPerRead_K, index_t KPerThreadLoop>
index_t ThreadGemmBDataPerRead_W>
struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
{ {
struct MatrixIndex static constexpr auto I0 = Number<0>{};
{ static constexpr auto I1 = Number<1>{};
index_t k; static constexpr auto I2 = Number<2>{};
index_t h; static constexpr auto I3 = Number<3>{};
index_t w; static constexpr auto I4 = Number<4>{};
};
using AIndex = MultiIndex<3>;
using BIndex = MultiIndex<3>;
using CIndex = MultiIndex<4>;
static constexpr auto E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0);
static constexpr auto KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1);
static constexpr auto E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2);
// HACK: fix this @Jing Zhang static constexpr auto HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
static constexpr index_t KPerThreadSubC = 4; static constexpr auto WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0);
static constexpr auto HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2);
static constexpr auto WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3);
static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed( static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadSubC>{})); make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<E2>{}));
static constexpr auto b_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( static constexpr auto b_thread_mtx_ =
Number<EPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{})); make_naive_tensor_descriptor_packed(make_tuple(Number<EPerThreadLoop>{},
Number<1>{},
Number<HoPerThread>{},
Number<WoPerThread>{},
Number<E2>{}));
static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple(
Number<KPerThreadSubC>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{})); Number<KPerThreadLoop>{}, Number<1>{}, Number<HoPerThread>{}, Number<WoPerThread>{}));
using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
BlockMatrixA,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadSubC>,
Sequence<0, 1>,
1,
ThreadGemmADataPerRead_K,
1>;
__device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
: c_thread_begin_mtx_idx_{GetBeginOfThreadMatrixC(get_thread_local_1d_id())}, : c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread)} a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)}
{ {
static_assert(BlockMatrixA::IsKnownAtCompileTime() && static_assert(ABlockDesc_E1_K1_E2::IsKnownAtCompileTime() &&
BlockMatrixB::IsKnownAtCompileTime() && BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
ThreadMatrixC::IsKnownAtCompileTime(), CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time"); "wrong! Desc should be known at compile-time");
constexpr auto I0 = Number<0>{}; static_assert(
constexpr auto I1 = Number<1>{}; ABlockDesc_E1_K1_E2{}.GetLength(I0) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I0) &&
constexpr auto I2 = Number<2>{}; ABlockDesc_E1_K1_E2{}.GetLength(I2) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I4),
constexpr auto I3 = Number<3>{}; "wrong! E dimension not consistent\n");
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t K = BlockMatrixA{}.GetLength(I1); // A is transposed static_assert(E1 % EPerThreadLoop == 0, "");
constexpr index_t H = BlockMatrixB{}.GetLength(I2); static_assert(KPerThread % KPerThreadLoop == 0, "");
constexpr index_t W = BlockMatrixB{}.GetLength(I3);
static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0, static_assert(KPerBlock % KPerThread == 0 && HoPerBlock % HoPerThread == 0 &&
WoPerBlock % WoPerThread == 0,
"wrong! Cannot evenly divide work among\n"); "wrong! Cannot evenly divide work among\n");
constexpr auto KThreadCluster = K / KPerThread; constexpr auto KThreadCluster = KPerBlock / KPerThread;
constexpr auto HThreadCluster = H / HPerThread; constexpr auto HThreadCluster = HoPerBlock / HoPerThread;
constexpr auto WThreadCluster = W / WPerThread; constexpr auto WThreadCluster = WoPerBlock / WoPerThread;
static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster, static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
"wrong! wrong blocksize\n"); "wrong! wrong blocksize\n");
} }
__device__ static constexpr auto GetThreadMatrixCLengths() __device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths()
{ {
return Sequence<KPerThread, 1, HPerThread, WPerThread>{}; return Sequence<KPerThread, I1, HoPerThread, WoPerThread>{};
} }
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) __device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
{ {
constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{}); constexpr auto K0 = KPerBlock / KPerThread;
constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{}); constexpr auto N0 = I1;
constexpr auto H0 = HoPerBlock / HoPerThread;
constexpr auto num_w_threads = W / WPerThread; constexpr auto W0 = WoPerBlock / WoPerThread;
constexpr auto num_h_threads = H / HPerThread;
constexpr auto num_hw_threads = num_w_threads * num_h_threads; constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor =
make_single_stage_tensor_adaptor(
index_t k_thread_id = thread_id / num_hw_threads; make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
index_t hw_thread_id = thread_id % num_hw_threads; make_tuple(Sequence<0, 1, 2, 3>{}),
make_tuple(Sequence<0>{}));
index_t h_thread_id = hw_thread_id / num_w_threads;
index_t w_thread_id = hw_thread_id % num_w_threads; const auto c_k_n_h_w_thread_cluster_idx =
c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex(
return MatrixIndex{k_thread_id, h_thread_id, w_thread_id}; make_multi_index(thread_id));
return c_k_n_h_w_thread_cluster_idx;
} }
template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer> template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
...@@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 ...@@ -116,19 +116,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
is_same<remove_cvref_t<typename CThreadBuffer::type>, remove_cvref_t<FloatC>>::value && is_same<remove_cvref_t<typename CThreadBuffer::type>, remove_cvref_t<FloatC>>::value &&
"wrong! inconsistent type"); "wrong! inconsistent type");
constexpr auto I0 = Number<0>{}; constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
// HACK: fix this @Jing Zhang
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A buffer for GEMM // thread A buffer for GEMM
StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true> StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
...@@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 ...@@ -139,42 +127,46 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
FloatC, FloatC,
decltype(a_thread_mtx_), decltype(a_thread_mtx_),
decltype(b_thread_mtx_), decltype(b_thread_mtx_),
decltype(c_thread_mtx_), decltype(c_thread_mtx_)>{};
HoPerThreadSubC,
WoPerThreadSubC>{};
static_for<0, EPerBlock, EPerThreadLoop>{}([&](auto e_begin) { static_for<0, E1, EPerThreadLoop>{}([&](auto e_begin) {
static_for<0, KPerThread, KPerThreadSubC>{}([&](auto k_begin) { static_for<0, KPerThread, KPerThreadLoop>{}([&](auto k_begin) {
a_thread_copy_.Run(a_block_mtx, a_thread_copy_.Run(a_block_mtx,
make_tuple(e_begin, k_begin), make_tuple(e_begin, k_begin, I0),
a_block_buf, a_block_buf,
a_thread_mtx_, a_thread_mtx_,
make_tuple(I0, I0), make_tuple(I0, I0, I0),
a_thread_buf); a_thread_buf);
static_for<0, HPerThread, HoPerThreadSubC>{}([&](auto h_begin) {
static_for<0, WPerThread, WoPerThreadSubC>{}([&](auto w_begin) {
threadwise_gemm.Run(a_thread_buf, threadwise_gemm.Run(a_thread_buf,
make_tuple(I0, I0), make_tuple(I0, I0, I0),
b_thread_buf, b_thread_buf,
make_tuple(e_begin, I0, h_begin, w_begin), make_tuple(e_begin, I0, I0, I0, I0),
c_thread_buf, c_thread_buf,
make_tuple(k_begin, I0, h_begin, w_begin)); make_tuple(k_begin, I0, I0, I0));
});
});
}); });
}); });
} }
template <typename ABlockSliceMoveStepIdx> template <typename ABlockSliceMoveStepIdx>
__device__ void MoveASliceWindow(const BlockMatrixA&, __device__ void MoveABlockSliceWindow(const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx)
{ {
a_thread_copy_.MoveSrcSliceWindow(BlockMatrixA{}, a_block_slice_move_step_idx); a_thread_copy_.MoveSrcSliceWindow(ABlockDesc_E1_K1_E2{}, a_block_slice_move_step_idx);
} }
private: private:
MatrixIndex c_thread_begin_mtx_idx_; using AThreadCopy =
ThreadwiseTensorSliceTransfer_v4<FloatA,
FloatA,
ABlockDesc_E1_K1_E2,
decltype(a_thread_mtx_),
Sequence<EPerThreadLoop, KPerThreadLoop, E2>,
Sequence<0, 1, 2>,
2,
E2,
E2>;
CIndex c_thread_origin_data_idx_;
AThreadCopy a_thread_copy_; AThreadCopy a_thread_copy_;
}; };
......
...@@ -9,21 +9,22 @@ namespace ck { ...@@ -9,21 +9,22 @@ namespace ck {
// C[M, N] += transpose(A[K, M]) * B[K, N] // C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data // Element of matrix can be vectorized data
// Assume: // Assume:
// 1. ADesc, BDesc, CDesc are known at compile-time // 1. AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo are known at
// compile-time
// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time // 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time
template <typename FloatA, template <typename FloatA,
typename FloatB, typename FloatB,
typename FloatC, typename FloatC,
typename ADesc, typename AThreadDesc_E1_K_E2,
typename BDesc, typename BThreadDesc_E1_N_Ho_Wo_E2,
typename CDesc, typename CThreadDesc_K_N_Ho_Wo,
index_t H, typename enable_if<AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
index_t W, BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
typename enable_if<ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() && CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
CDesc::IsKnownAtCompileTime(),
bool>::type = false> bool>::type = false>
struct ThreadwiseGemmDlops_km_kn_mn_v3 struct ThreadwiseGemmDlops_km_kn_mn_v3
{ {
template <typename ABuffer, template <typename ABuffer,
typename AOriginIdx, typename AOriginIdx,
typename BBuffer, typename BBuffer,
...@@ -37,8 +38,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -37,8 +38,10 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
CBuffer& c_buf, CBuffer& c_buf,
COriginIdx) COriginIdx)
{ {
static_assert(ADesc::IsKnownAtCompileTime() && BDesc::IsKnownAtCompileTime() &&
CDesc::IsKnownAtCompileTime(), static_assert(AThreadDesc_E1_K_E2::IsKnownAtCompileTime() &&
BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time"); "wrong! Desc should be known at compile-time");
static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value && static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
...@@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 ...@@ -54,102 +57,107 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto E = ADesc{}.GetLength(I0); constexpr auto E1 = AThreadDesc_E1_K_E2{}.GetLength(I0);
constexpr auto K = ADesc{}.GetLength(I1); constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1);
constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2);
constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
static_for<0, E, 1>{}([&](auto e) { if constexpr((Ho % 2 == 0) && (Wo % 2 == 0))
{
constexpr auto SubHW = 2;
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
constexpr index_t a_offset = static_for<0, Ho, SubHW>{}([&](auto h) {
ADesc{}.CalculateOffset(a_origin_idx + make_tuple(e, k)); static_for<0, Wo, SubHW>{}([&](auto w) {
static_for<0, E1, 1>{}([&](auto e1) {
static_for<0, E2, 1>{}([&](auto e2) {
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
a_origin_idx + make_tuple(e1, k, e2));
if constexpr(H == 2 && W == 2) constexpr index_t b0_offset =
{ BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
constexpr index_t b_offset_0 = b_origin_idx + make_tuple(e1, 0, h, w, e2));
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0));
constexpr index_t b_offset_1 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 1));
constexpr index_t b_offset_2 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_3 =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 1));
constexpr index_t c_offset_0 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 1));
constexpr index_t c_offset_2 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 1));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}], constexpr index_t b1_offset =
b_buf[Number<b_offset_0>{}], BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_buf[Number<b_offset_1>{}], b_origin_idx + make_tuple(e1, 0, h, w + 1, e2));
b_buf[Number<b_offset_2>{}],
b_buf[Number<b_offset_3>{}], constexpr index_t b2_offset =
c_buf(Number<c_offset_0>{}), BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
c_buf(Number<c_offset_1>{}), b_origin_idx + make_tuple(e1, 0, h + 1, w, e2));
c_buf(Number<c_offset_2>{}),
c_buf(Number<c_offset_3>{})); constexpr index_t b3_offset =
} BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
else if constexpr(H == 4 && W == 1) b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));
{
constexpr index_t b_offset_0 = constexpr index_t c0_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 0, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
constexpr index_t b_offset_1 = make_tuple(k, 0, h, w));
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 1, 0));
constexpr index_t b_offset_2 = constexpr index_t c1_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 2, 0)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
constexpr index_t b_offset_3 = c_origin_idx + make_tuple(k, 0, h, w + 1));
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, 3, 0));
constexpr index_t c2_offset =
constexpr index_t c_offset_0 = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 0, 0)); c_origin_idx + make_tuple(k, 0, h + 1, w));
constexpr index_t c_offset_1 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 1, 0)); constexpr index_t c3_offset =
constexpr index_t c_offset_2 = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 2, 0)); c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
constexpr index_t c_offset_3 =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, 3, 0));
amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}], amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset_0>{}], b_buf[Number<b0_offset>{}],
b_buf[Number<b_offset_1>{}], b_buf[Number<b1_offset>{}],
b_buf[Number<b_offset_2>{}], b_buf[Number<b2_offset>{}],
b_buf[Number<b_offset_3>{}], b_buf[Number<b3_offset>{}],
c_buf(Number<c_offset_0>{}), c_buf(Number<c0_offset>{}),
c_buf(Number<c_offset_1>{}), c_buf(Number<c1_offset>{}),
c_buf(Number<c_offset_2>{}), c_buf(Number<c2_offset>{}),
c_buf(Number<c_offset_3>{})); c_buf(Number<c3_offset>{}));
});
});
});
});
});
} }
else else
{ {
static_for<0, H, 1>{}([&](auto h) {
static_for<0, W, 1>{}([&](auto w) { static_for<0, K, 1>{}([&](auto k) {
static_for<0, Ho, 1>{}([&](auto h) {
static_for<0, Wo, 1>{}([&](auto w) {
static_for<0, E1, 1>{}([&](auto e1) {
static_for<0, E2, 1>{}([&](auto e2) {
constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
a_origin_idx + make_tuple(e1, k, e2));
constexpr index_t b_offset = constexpr index_t b_offset =
BDesc{}.CalculateOffset(b_origin_idx + make_tuple(e, 0, h, w)); BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
b_origin_idx + make_tuple(e1, 0, h, w, e2));
constexpr index_t c_offset = constexpr index_t c_offset =
CDesc{}.CalculateOffset(c_origin_idx + make_tuple(k, 0, h, w)); CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx +
make_tuple(k, 0, h, w));
#if 0 inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
c_buf(Number<c_offset>{}) += inner_product_with_conversion<FloatC>{}(
a_buf[Number<a_offset>{}], b_buf[Number<b_offset>{}]);
#else
amd_assembly_inner_product(a_buf[Number<a_offset>{}],
b_buf[Number<b_offset>{}], b_buf[Number<b_offset>{}],
c_buf(Number<c_offset>{})); c_buf(Number<c_offset>{}));
#endif
}); });
}); });
}
}); });
}); });
});
}
} }
}; };
......
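The ThreadwiseGemmDlops_km_kn_mn_v3::Run above accumulates C[k, 0, h, w] += sum over (e1, e2) of A[e1, k, e2] * B[e1, 0, h, w, e2]. For orientation, a minimal host-side reference sketch of that accumulation (illustrative only; plain float buffers, hypothetical function name, no vectorized types or compile-time origin offsets):

#include <vector>

// Reference loop: C has layout [K][Ho][Wo], A is [E1][K][E2], B is [E1][Ho][Wo][E2]
// (the N dimension of the thread descriptors is fixed to 1 and dropped here).
void reference_threadwise_gemm(const std::vector<float>& a,
                               const std::vector<float>& b,
                               std::vector<float>& c,
                               int E1, int K, int E2, int Ho, int Wo)
{
    for(int k = 0; k < K; ++k)
        for(int h = 0; h < Ho; ++h)
            for(int w = 0; w < Wo; ++w)
                for(int e1 = 0; e1 < E1; ++e1)
                    for(int e2 = 0; e2 < E2; ++e2)
                        c[(k * Ho + h) * Wo + w] +=
                            a[(e1 * K + k) * E2 + e2] *
                            b[((e1 * Ho + h) * Wo + w) * E2 + e2];
}

The device code performs the same sums with compile-time offsets, uses inner_product / amd_assembly_outer_product_1x4 for the E2 dot products, and unrolls Ho/Wo by 2x2 when both are even.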
...@@ -217,6 +217,22 @@ struct ThreadwiseTensorSliceTransfer_v1r3
                 is_dst_valid,
                 dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
         }
+        else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add)
+        {
+            typename vector_type<DstData, DstScalarPerVector>::type tmp;
+
+            tmp.template AsType<dst_vector_t>()(Number<0>{}) =
+                dst_buf.template Get<dst_vector_t>(dst_coord_.GetOffset(), is_dst_valid);
+
+            static_for<0, DstScalarPerVector, 1>{}([&](auto t) {
+                dst_vector.template AsType<DstData>()(t) += tmp.template AsType<DstData>()[t];
+            });
+
+            dst_buf.template Set<dst_vector_t>(
+                dst_coord_.GetOffset(),
+                is_dst_valid,
+                dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
+        }
 
         constexpr auto move_on_dim = [&]() constexpr
         {
...@@ -666,6 +682,25 @@ struct ThreadwiseTensorSliceTransfer_v2
         move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
     }
 
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <typename SrcMoveSliceWindowStepHack>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDesc& src_desc,
+                       const Index& src_slice_origin_step_idx,
+                       const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
+    {
+        // if src coord was not reset by RunRead(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
+                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
+
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(
+            src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
+
+        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
+    }
+
     private:
     SrcCoord src_coord_;
 }; // namespace ck
......
...@@ -268,14 +268,14 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -268,14 +268,14 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2( const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<double>(tmp); return bit_cast<double>(tmp);
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<double2_t>(tmp); return bit_cast<double2_t>(tmp);
} }
else if constexpr(N == 4) else if constexpr(N == 4)
{ {
...@@ -289,8 +289,8 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -289,8 +289,8 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
0); 0);
vector_type<double, 4> tmp; vector_type<double, 4> tmp;
tmp.AsType<double2_t>()(Number<0>{}) = as_type<double2_t>(f32_0); tmp.AsType<double2_t>()(Number<0>{}) = bit_cast<double2_t>(f32_0);
tmp.AsType<double2_t>()(Number<1>{}) = as_type<double2_t>(f32_1); tmp.AsType<double2_t>()(Number<1>{}) = bit_cast<double2_t>(f32_1);
return tmp.AsType<double4_t>()(Number<0>{}); return tmp.AsType<double4_t>()(Number<0>{});
} }
...@@ -351,7 +351,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -351,7 +351,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<half8_t>(tmp); return bit_cast<half8_t>(tmp);
} }
} }
else if constexpr(is_same<T, ushort>::value) else if constexpr(is_same<T, ushort>::value)
...@@ -376,7 +376,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -376,7 +376,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<ushort8_t>(tmp); return bit_cast<ushort8_t>(tmp);
} }
} }
else if constexpr(is_same<T, int32_t>::value) else if constexpr(is_same<T, int32_t>::value)
...@@ -427,7 +427,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -427,7 +427,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
int16_t tmp = llvm_amdgcn_raw_buffer_load_i16( int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<int8x2_t>(tmp); return bit_cast<int8x2_t>(tmp);
#endif #endif
} }
else if constexpr(N == 4) else if constexpr(N == 4)
...@@ -439,7 +439,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -439,7 +439,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
int32_t tmp = llvm_amdgcn_raw_buffer_load_i32( int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<int8x4_t>(tmp); return bit_cast<int8x4_t>(tmp);
#endif #endif
} }
else if constexpr(N == 8) else if constexpr(N == 8)
...@@ -461,7 +461,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -461,7 +461,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2( int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<int8x8_t>(tmp); return bit_cast<int8x8_t>(tmp);
#endif #endif
} }
else if constexpr(N == 16) else if constexpr(N == 16)
...@@ -495,7 +495,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w ...@@ -495,7 +495,7 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(
src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
return as_type<int8x16_t>(tmp); return bit_cast<int8x16_t>(tmp);
#endif #endif
} }
} }
...@@ -521,7 +521,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -521,7 +521,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
// use fp32 store to mimic fp64 store // use fp32 store to mimic fp64 store
if constexpr(N == 1) if constexpr(N == 1)
{ {
llvm_amdgcn_raw_buffer_store_fp32x2(as_type<float2_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast<float2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -529,7 +529,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -529,7 +529,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
} }
else if constexpr(N == 2) else if constexpr(N == 2)
{ {
llvm_amdgcn_raw_buffer_store_fp32x4(as_type<float4_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -591,6 +591,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -591,6 +591,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
#if 0
vector_type<half_t, 8> tmp{src_thread_data}; vector_type<half_t, 8> tmp{src_thread_data};
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}], llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType<half4_t>()[Number<0>{}],
...@@ -604,6 +605,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -604,6 +605,13 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(half_t), dst_wave_addr_offset + 4 * sizeof(half_t),
0); 0);
#else
llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<float4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
0);
#endif
} }
} }
else if constexpr(is_same<T, ushort>::value) else if constexpr(is_same<T, ushort>::value)
...@@ -695,7 +703,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -695,7 +703,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_wave_addr_offset, dst_wave_addr_offset,
0); 0);
#else #else
llvm_amdgcn_raw_buffer_store_i16(as_type<int16_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_i16(bit_cast<int16_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -711,7 +719,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -711,7 +719,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
dst_wave_addr_offset, dst_wave_addr_offset,
0); 0);
#else #else
llvm_amdgcn_raw_buffer_store_i32(as_type<int32_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_i32(bit_cast<int32_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -720,7 +728,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -720,7 +728,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
} }
else if constexpr(N == 8) else if constexpr(N == 8)
{ {
llvm_amdgcn_raw_buffer_store_i32x2(as_type<int32x2_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_i32x2(bit_cast<int32x2_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
...@@ -728,7 +736,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src ...@@ -728,7 +736,7 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
} }
else if constexpr(N == 16) else if constexpr(N == 16)
{ {
llvm_amdgcn_raw_buffer_store_i32x4(as_type<int32x4_t>(src_thread_data), llvm_amdgcn_raw_buffer_store_i32x4(bit_cast<int32x4_t>(src_thread_data),
dst_wave_buffer_resource, dst_wave_buffer_resource,
dst_thread_addr_offset, dst_thread_addr_offset,
dst_wave_addr_offset, dst_wave_addr_offset,
......
...@@ -211,14 +211,14 @@ amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0
                    v_dot4_i32_i8 %1, %2, %4, %1\n \
                    "
                   : "=v"(c0), "=v"(c1)
-                  : "v"(as_type<int32_t>(a)),
-                    "v"(as_type<int32_t>(b0)),
-                    "v"(as_type<int32_t>(b1)),
+                  : "v"(bit_cast<int32_t>(a)),
+                    "v"(bit_cast<int32_t>(b0)),
+                    "v"(bit_cast<int32_t>(b1)),
                     "0"(c0),
                     "1"(c1));
 #else
-    c0 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b0), c0, false);
-    c1 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b1), c1, false);
+    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
+    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
 #endif
 }
...@@ -244,20 +244,20 @@ __device__ void amd_assembly_outer_product_1x4(int8x4_t a,
                    v_dot4_i32_i8 %3, %4, %8, %3\n \
                    "
                   : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3)
-                  : "v"(as_type<int32_t>(a)),
-                    "v"(as_type<int32_t>(b0)),
-                    "v"(as_type<int32_t>(b1)),
-                    "v"(as_type<int32_t>(b2)),
-                    "v"(as_type<int32_t>(b3)),
+                  : "v"(bit_cast<int32_t>(a)),
+                    "v"(bit_cast<int32_t>(b0)),
+                    "v"(bit_cast<int32_t>(b1)),
+                    "v"(bit_cast<int32_t>(b2)),
+                    "v"(bit_cast<int32_t>(b3)),
                     "0"(c0),
                     "1"(c1),
                     "2"(c2),
                     "3"(c3));
 #else
-    c0 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b0), c0, false);
-    c1 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b1), c1, false);
-    c2 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b2), c2, false);
-    c3 = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b3), c3, false);
+    c0 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b0), c0, false);
+    c1 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b1), c1, false);
+    c2 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b2), c2, false);
+    c3 = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b3), c3, false);
 #endif
 }
......
...@@ -340,8 +340,8 @@ struct intrin_mfma_i32_32x32x8i8<32, 32>
     __device__ static void Run(const int8x4_t& reg_a, const int8x4_t& reg_b, FloatC& reg_c)
     {
         reg_c.template AsType<int32x16_t>()(Number<0>{}) =
-            llvm_intrin_amdgcn_mfma_i32_32x32x8i8(as_type<int>(reg_a),
-                                                  as_type<int>(reg_b),
+            llvm_intrin_amdgcn_mfma_i32_32x32x8i8(bit_cast<int>(reg_a),
+                                                  bit_cast<int>(reg_b),
                                                   reg_c.template AsType<int32x16_t>()[Number<0>{}],
                                                   0,
                                                   0,
...@@ -359,8 +359,8 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
     __device__ static void Run(const int8x4_t& reg_a, const int8x4_t& reg_b, FloatC& reg_c)
     {
         reg_c.template AsType<int32x4_t>()(Number<0>{}) =
-            llvm_intrin_amdgcn_mfma_i32_16x16x16i8(as_type<int>(reg_a),
-                                                   as_type<int>(reg_b),
+            llvm_intrin_amdgcn_mfma_i32_16x16x16i8(bit_cast<int>(reg_a),
+                                                   bit_cast<int>(reg_b),
                                                    reg_c.template AsType<int32x4_t>()[Number<0>{}],
                                                    0,
                                                    0,
......
...@@ -96,9 +96,22 @@
 // pass tensor descriptor by value or void*
 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
+#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
 
 // merge transformation use magic number division
+#ifndef CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
 #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
+#endif
+
+// use __builtin_memcpy instead of pointer cast to access a vector from pointer of scalar
+#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
+#endif
+
+// use __builtin_memcpy instead of union to do bit_cast
+#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
+#endif
 
 // hack: have underlying assumption that needs to be satisfied, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
...@@ -118,7 +131,7 @@
 #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
 #endif
 
-// workaround for compiler crash when using buffer load/store for i8
+// workaround for compiler generating inefficient ds_write instructions
 #ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
 #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
 #endif
...@@ -133,7 +146,15 @@ namespace ck {
 enum InMemoryDataOperationEnum_t
 {
     Set,
-    AtomicAdd
+    AtomicAdd,
+    Add
+};
+
+enum ActivTypeEnum_t
+{
+    None = 0,
+    LeakyRelu,
+    Sigmoid
 };
 
 // index type
......
...@@ -1081,11 +1081,11 @@ struct NumericLimits<half_t>
     static constexpr unsigned short binary_max = 0x7BFF;
     static constexpr unsigned short binary_lowest = 0xFBFF;
 
-    __host__ __device__ static constexpr half_t Min() { return as_type<half_t>(binary_min); }
-    __host__ __device__ static constexpr half_t Max() { return as_type<half_t>(binary_max); }
-    __host__ __device__ static constexpr half_t Lowest() { return as_type<half_t>(binary_lowest); }
+    __host__ __device__ static constexpr half_t Min() { return bit_cast<half_t>(binary_min); }
+    __host__ __device__ static constexpr half_t Max() { return bit_cast<half_t>(binary_max); }
+    __host__ __device__ static constexpr half_t Lowest() { return bit_cast<half_t>(binary_lowest); }
 };
 
 } // namespace ck
......
...@@ -83,12 +83,28 @@ struct DynamicBuffer
     {
         if constexpr(InvalidElementUseNumericalZeroValue)
         {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+            X tmp;
+            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            return is_valid_element ? tmp : X{0};
+#else
             return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
+#endif
         }
         else
         {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+            X tmp;
+            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            return is_valid_element ? tmp : X{invalid_element_value_};
+#else
             return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
                                     : X{invalid_element_value_};
+#endif
         }
     }
 }
...@@ -117,7 +133,13 @@ struct DynamicBuffer
 #else
         if(is_valid_element)
         {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+            X tmp = x;
+            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+#else
             *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+#endif
         }
 #endif
     }
...@@ -126,7 +148,13 @@ struct DynamicBuffer
         if(is_valid_element)
         {
 #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+            X tmp = x;
+            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+#else
             *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+#endif
 #else
             // HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
             // inefficient
...@@ -201,7 +229,13 @@ struct DynamicBuffer
             }
             else
             {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+                X tmp = x;
+                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+#else
                 *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+#endif
             }
 #endif
         }
...@@ -210,7 +244,13 @@ struct DynamicBuffer
     {
         if(is_valid_element)
         {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+            X tmp = x;
+            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+#else
             *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+#endif
         }
     }
 }
......
...@@ -144,9 +144,9 @@ inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b,
                      v_dot4_i32_i8 %0, %1, %2, %0\n \
                      "
                      : "=v"(c)
-                     : "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
+                     : "v"(bit_cast<int32_t>(a)), "v"(bit_cast<int32_t>(b)), "0"(c));
 #else
-    c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
+    c = __builtin_amdgcn_sdot4(bit_cast<int32_t>(a), bit_cast<int32_t>(b), c, false);
 #endif
 #else
     const vector_type<int8_t, 4> a_vector{a};
......
...@@ -125,7 +125,7 @@ struct MagicDivision
     __host__ __device__ static constexpr int32_t
     DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
     {
-        uint32_t dividend_u32 = as_type<uint32_t>(dividend_i32);
+        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
         uint32_t tmp = __umulhi(dividend_u32, multiplier);
         return (tmp + dividend_u32) >> shift;
     }
......
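DoMagicDivision above evaluates (umulhi(n, multiplier) + n) >> shift, the classic "multiply-high, add, shift" trick for dividing by a runtime-constant divisor without an integer divide. A small host-side sketch of one way to derive a (multiplier, shift) pair and check the identity; the derivation and helper names are assumptions for illustration, not the library's own CalculateMagicNumbers code:

#include <cassert>
#include <cstdint>

// For divisor d (1 <= d <= 2^31), pick shift = ceil(log2(d)) and
// multiplier = floor(2^32 * (2^shift - d) / d) + 1, so that for any
// dividend n < 2^31:  n / d == (umulhi(n, multiplier) + n) >> shift.
void make_magic(uint32_t d, uint32_t& multiplier, uint32_t& shift)
{
    shift = 0;
    while((uint64_t(1) << shift) < d)
        ++shift;

    multiplier = uint32_t(((uint64_t(1) << 32) * ((uint64_t(1) << shift) - d)) / d + 1);
}

uint32_t magic_div(uint32_t n, uint32_t multiplier, uint32_t shift)
{
    uint32_t hi = uint32_t((uint64_t(n) * multiplier) >> 32); // __umulhi on device
    return (hi + n) >> shift;
}

int main()
{
    uint32_t m, s;
    make_magic(3, m, s); // d = 3 gives shift = 2, multiplier = 1431655766

    for(uint32_t n = 0; n < 100000; ++n)
        assert(magic_div(n, m, s) == n / 3);
}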
...@@ -54,5 +54,49 @@ __host__ __device__ constexpr auto make_statically_indexed_array()
    return StaticallyIndexedArray<X, 0>();
}
template <typename T, index_t N>
struct StaticallyIndexedArray_v2
{
__host__ __device__ constexpr StaticallyIndexedArray_v2() = default;
__host__ __device__ static constexpr index_t Size() { return N; }
// read access
template <index_t I>
__host__ __device__ constexpr const auto& At(Number<I>) const
{
static_assert(I < N, "wrong! out of range");
return data_[I];
}
// write access
template <index_t I>
__host__ __device__ constexpr auto& At(Number<I>)
{
static_assert(I < N, "wrong! out of range");
return data_[I];
}
// read access
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
{
return At(i);
}
// write access
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I> i)
{
return At(i);
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
T data_[N];
};
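A minimal usage sketch for the new StaticallyIndexedArray_v2 (illustrative only, hypothetical function name; all indices are compile-time Number<> constants, so out-of-range access is rejected by the static_assert in At()):

__host__ __device__ inline void statically_indexed_array_v2_example()
{
    StaticallyIndexedArray_v2<float, 4> arr;

    arr(Number<0>{}) = 1.0f;    // write access through operator()/At()
    arr(Number<1>{}) = 2.0f;

    float x = arr[Number<0>{}]; // read access through operator[]/At()
    (void)x;

    static_assert(decltype(arr)::Size() == 4, "");
}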
} // namespace ck
#endif
...@@ -32,8 +32,15 @@ template <typename T>
 inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
 
 template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
-__host__ __device__ constexpr Y as_type(X x)
+__host__ __device__ constexpr Y bit_cast(const X& x)
 {
+#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
+    Y y;
+    __builtin_memcpy(&y, &x, sizeof(X));
+    return y;
+#else
     union AsType
     {
         X x;
...@@ -41,6 +48,7 @@ __host__ __device__ constexpr Y as_type(X x)
     };
 
     return AsType{x}.y;
+#endif
 }
 
 } // namespace ck
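A small sketch of how the renamed bit_cast is used; same semantics as the old as_type, reinterpreting the object representation of a value, and with CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST == 1 it avoids union-based type punning (hypothetical function name):

#include <cstdint>

// Round-trips a float through its 32-bit pattern and back.
inline bool bit_cast_roundtrip_example(float f)
{
    uint32_t u  = ck::bit_cast<uint32_t>(f);
    float    f2 = ck::bit_cast<float>(u);

    return ck::bit_cast<uint32_t>(f2) == u;
}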
......
...@@ -9,7 +9,6 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
-#include "gemm_common.hpp"
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_base.hpp"
...@@ -139,12 +138,12 @@ int main(int argc, char* argv[])
     {
         case 0: break;
         case 1:
-            a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
-            b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+            a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+            b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
             break;
         default:
-            a_m_k.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0});
-            b_k_n.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5});
+            a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+            b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
     }
 
     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
......
...@@ -13,16 +13,25 @@ include_directories(BEFORE
 )
 
 set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
+set(CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_fwd_driver_offline_nchwc.cpp)
+set(CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_add_fwd_driver_offline_nchwc.cpp)
+set(CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE src/conv_maxpool_fwd_driver_offline_nchwc.cpp)
 set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
 set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
 set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
 
 add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
+add_executable(conv_fwd_driver_offline_nchwc ${CONV_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
+add_executable(conv_add_fwd_driver_offline_nchwc ${CONV_ADD_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
+add_executable(conv_maxpool_fwd_driver_offline_nchwc ${CONV_MAXPOOL_FWD_DRIVER_OFFLINE_NCHWC_SOURCE})
 add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
 add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
 add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
 
 target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
+target_link_libraries(conv_fwd_driver_offline_nchwc PRIVATE host_tensor)
+target_link_libraries(conv_add_fwd_driver_offline_nchwc PRIVATE host_tensor)
+target_link_libraries(conv_maxpool_fwd_driver_offline_nchwc PRIVATE host_tensor)
 target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
 target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
 target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
ck::ActivTypeEnum_t activ_type,
typename InLengths,
typename WeiLengths,
typename AddLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
const InLengths& in_n_c0_hi_wi_c1_lengths,
const WeiLengths& wei_k_c0_y_x_c1_lengths,
const AddLengths& add_n_k0_hox2_wox2_k1_lengths,
const OutLengths& out_n_k0_ho_wo_k1_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
const Tensor<TOut>& add_n_k0_hox2_wox2_k1,
Tensor<TOut>& add_n_k0_hox2_wox2_k1_out,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
const auto K = wei_k_c0_y_x_c1_lengths[I0];
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
const auto X = wei_k_c0_y_x_c1_lengths[I3];
const auto Hox2 = add_n_k0_hox2_wox2_k1_lengths[I2];
const auto Wox2 = add_n_k0_hox2_wox2_k1_lengths[I3];
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem add_n_k0_hox2_wox2_k1_device_buf(sizeof(TOut) *
add_n_k0_hox2_wox2_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
}
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr auto BlockSize = 64;
constexpr auto KPerBlock = 8;
constexpr auto HoPerBlock = 8;
constexpr auto WoPerBlock = 32;
constexpr auto E1 = 2 * 9;
constexpr auto E2 = 1;
constexpr auto K2 = 2;
constexpr auto E1PerBlock = 2;
constexpr auto KPerThread = KPerBlock;
constexpr auto HoPerThread = 2;
constexpr auto WoPerThread = 2;
constexpr auto EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr auto ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr auto ABlockTransferDstScalarPerVector_E2 = E2;
constexpr auto BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr auto CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto add_n_k0_hox2_wox2_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Hox2, Wox2, K1));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_add<
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
BThreadTransferSrcScalarPerVector_E2,
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_resize_add_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_addout_n" << N << "k" << K0
<< "h" << Ho * 2 << "w" << Wo * 2 << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
const auto ave_time =
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
add_n_k0_hox2_wox2_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
nrepeat);
{
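            // FLOP count: 2 * N * K * Ho * Wo * (C0 * C1) * Y * X; dividing by 1e9
            // and by the average time in milliseconds yields TFlop/s.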
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
add_n_k0_hox2_wox2_k1_device_buf.ToDevice(add_n_k0_hox2_wox2_k1.mData.data());
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
add_n_k0_hox2_wox2_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k0_hox2_wox2_k1_device_buf.GetDeviceBuffer()),
0);
add_n_k0_hox2_wox2_k1_device_buf.FromDevice(add_n_k0_hox2_wox2_k1_out.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
ck::ActivTypeEnum_t activ_type,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1(
const InLengths& in_n_c0_hi_wi_c1_lengths,
const WeiLengths& wei_k_c0_y_x_c1_lengths,
const OutLengths& out_n_k0_ho_wo_k1_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
Tensor<TOut>& out_n_k0_ho_wo_k1,
ck::index_t nrepeat)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
const auto N = out_n_k0_ho_wo_k1_lengths[I0];
const auto K0 = out_n_k0_ho_wo_k1_lengths[I1];
const auto Ho = out_n_k0_ho_wo_k1_lengths[I2];
const auto Wo = out_n_k0_ho_wo_k1_lengths[I3];
const auto K1 = out_n_k0_ho_wo_k1_lengths[I4];
const auto C0 = in_n_c0_hi_wi_c1_lengths[I1];
const auto Hi = in_n_c0_hi_wi_c1_lengths[I2];
const auto Wi = in_n_c0_hi_wi_c1_lengths[I3];
const auto C1 = in_n_c0_hi_wi_c1_lengths[I4];
const auto K = wei_k_c0_y_x_c1_lengths[I0];
const auto Y = wei_k_c0_y_x_c1_lengths[I2];
const auto X = wei_k_c0_y_x_c1_lengths[I3];
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
constexpr index_t InWeiVectorSize = 8;
if(C1 % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
}
#if 0
constexpr index_t BlockSize = 256;
constexpr index_t KPerBlock = 32;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 64;
constexpr index_t E1 = C0 * 9;
constexpr index_t E2 = 1;
constexpr index_t E1PerBlock = C0;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 = Sequence<1, E1PerBlock, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = K1;
#elif 1
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 8;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t E1 = 2 * 9;
constexpr index_t E2 = 1;
constexpr index_t K2 = 2;
constexpr index_t E1PerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2 = Sequence<1, 9, 1, 1, E2>;
using ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2 =
Sequence<1, E1PerBlock, 1, KPerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
constexpr index_t BThreadTransferSrcScalarPerVector_E2 = E2;
constexpr index_t CThreadTransferDstScalarPerVector_K = InWeiVectorSize;
#endif
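// The constants above are compile-time tuning parameters: *PerBlock values are the
// output tile handled by one workgroup, *PerThread values the sub-tile handled by
// one work-item; only the "#elif 1" branch is currently compiled.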
if(KPerThread % InWeiVectorSize != 0)
{
throw std::runtime_error("wrong! C1 cannot be divided by InWeiVectorSize");
}
const auto in_n_c0_hi_wi_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi, E2));
const auto wei_k_c0_y_x_c1_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X, E2));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
constexpr auto conv_driver =
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad<
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
E1,
E2,
K2,
KPerBlock,
HoPerBlock,
WoPerBlock,
E1PerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2,
ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2,
ABlockTransferSrcScalarPerVector_E2,
ABlockTransferDstScalarPerVector_E2,
BThreadTransferSrcScalarPerVector_E2,
CThreadTransferDstScalarPerVector_K,
activ_type>{};
std::cerr << "conv_bias_activ_input_"
<< "n" << N << "c" << C0 << "h" << Hi << "w" << Wi << "c" << C1 << "_filter_k" << K
<< "c" << C0 << "y" << Y << "x" << X << "c" << C1 << "_convout_n" << N << "k" << K0
<< "h" << Ho << "w" << Wo << "k" << K1 << std::endl;
for(int i = 0; i < 5; i++)
{
const auto ave_time =
conv_driver.Run(wei_k_c0_y_x_c1_desc,
in_n_c0_hi_wi_c1_desc,
out_n_k0_ho_wo_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
nrepeat);
{
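// 2 flops per multiply-accumulate over N * K * Ho * Wo * (C0 * C1) * Y * X MACs;
// dividing GFLOP by the average time in milliseconds yields TFLOP/s.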
float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C0 * C1 * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
}
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
template <typename TInWei,
ck::index_t InWeiVectorSize,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
ck::index_t /* nrepeat */)
{
using namespace ck;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto N = out_n_k_ho_wo_lengths[I0];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto C = wei_k_c_y_x_lengths[I1];
const auto Hi = in_n_c_hi_wi_lengths[I2];
const auto Wi = in_n_c_hi_wi_lengths[I3];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
const auto C0 = C / Number<InWeiVectorSize>{};
const auto C1 = Number<InWeiVectorSize>{};
const auto K0 = K / Number<InWeiVectorSize>{};
const auto K1 = Number<InWeiVectorSize>{};
Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
Tensor<TInWei> wei_k_c0_y_x_c1(
HostTensorDescriptor(std::initializer_list<index_t>{K, C0, Y, X, C1}));
Tensor<TOut> out_n_k0_ho_wo_k1(
HostTensorDescriptor(std::initializer_list<index_t>{N, K0, Ho, Wo, K1}));
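// Repack: element (n, c, hi, wi) of the NCHW input maps to
// (n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize), i.e. c = c0 * C1 + c1;
// the KCYX weight is split the same way.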
auto f_nchw2nc0hwc1 = [&](auto n, auto hi, auto wi, auto c) {
in_n_c0_hi_wi_c1(n, c / InWeiVectorSize, hi, wi, c % InWeiVectorSize) =
in_n_c_hi_wi(n, c, hi, wi);
};
auto f_kcyx2kc0yxc1 = [&](auto k, auto y, auto x, auto c) {
wei_k_c0_y_x_c1(k, c / InWeiVectorSize, y, x, c % InWeiVectorSize) =
wei_k_c_y_x(k, c, y, x);
};
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
const auto in_n_c0_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_naive_tensor_descriptor_packed(make_tuple(N, K0, Ho, Wo, K1));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = KPerBlock;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 16;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else
constexpr index_t BlockSize = 64;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif
constexpr auto conv_driver =
#if 0
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outpad
#endif
<BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc,
out_n_k0_ho_wo_k1_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
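// Unpack the N,K0,Ho,Wo,K1 result back to NKHW with k = k0 * InWeiVectorSize + k1.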
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) =
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
};
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
}
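// Hypothetical usage sketch (illustrative only; the sizes, data type and vector
// width below are made-up placeholders, not taken from this commit):
//
//   const auto in_lengths  = ck::make_tuple(1, 16, 28, 28);  // N, C, Hi, Wi
//   const auto wei_lengths = ck::make_tuple(64, 16, 3, 3);   // K, C, Y, X
//   const auto out_lengths = ck::make_tuple(1, 64, 26, 26);  // N, K, Ho, Wo (3x3, stride 1, no pad)
//   const auto ones  = ck::make_tuple(1, 1);
//   const auto zeros = ck::make_tuple(0, 0);
//   Tensor<float> in (HostTensorDescriptor(std::initializer_list<index_t>{1, 16, 28, 28}));
//   Tensor<float> wei(HostTensorDescriptor(std::initializer_list<index_t>{64, 16, 3, 3}));
//   Tensor<float> out(HostTensorDescriptor(std::initializer_list<index_t>{1, 64, 26, 26}));
//   device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<float, 4, float, float>(
//       in_lengths, wei_lengths, out_lengths, ones, ones, zeros, zeros, in, wei, out, 1);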