Commit fdaaaa50 authored by Chao Liu

Merge branch 'direct_fp16'

parents 2c9b8c24 18a81e35
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2.hip.hpp"
#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2(InDesc,
void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
......@@ -46,6 +46,27 @@ void device_direct_convolution_2(InDesc,
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#elif 1
// 3x3, 34x34, 128 thread, fp16
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#endif
......@@ -57,7 +78,8 @@ void device_direct_convolution_2(InDesc,
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(gridwise_direct_convolution_2<T,
float time =
launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw<T,
InDesc,
WeiDesc,
OutDesc,
......@@ -71,6 +93,8 @@ void device_direct_convolution_2(InDesc,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
BlockSize,
GridSize>,
dim3(GridSize),
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp"
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,
WeiDesc,
const Tensor<TInWei>& wei_kcyx,
OutDesc,
Tensor<TOut>& out_nkhw,
unsigned nrepeat)
{
// this assumes the in / wei data type is int8x4
constexpr unsigned NVector = 4;
using accum_t = int32_t;
using vector_t = vector_type<TInWei, NVector>;
using vector_mem_t = typename vector_t::MemoryType;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
constexpr unsigned Wi = in_nchw_desc.GetLength(I3);
constexpr unsigned N = out_nkhw_desc.GetLength(I0);
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
constexpr unsigned K = wei_kcyx_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_desc.GetLength(I3);
// vectorized input
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 0
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#elif 0
in_nchw_vec(n, c, h, w) =
vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
in_nchw(n, 4 * c + 1, h, w),
in_nchw(n, 4 * c + 2, h, w),
in_nchw(n, 4 * c + 3, h, w));
#endif
};
make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
std::thread::hardware_concurrency());
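// Illustrative note (not part of the original commit): with NVector = 4, the C
// dimension is folded by 4, so element (n, c, h, w) of in_nchw_vec holds the four
// scalars in_nchw(n, 4c .. 4c+3, h, w) packed into one vector_mem_t. For example,
// assuming C = 8, in_nchw_vec has shape N x 2 x Hi x Wi and in_nchw_vec(n, 1, h, w)
// packs channels 4, 5, 6 and 7.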
// vectorized weight
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 0
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#elif 0
wei_kcyx_vec(k, c, y, x) =
vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#elif 1
wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
wei_kcyx(k, 4 * c + 1, y, x),
wei_kcyx(k, 4 * c + 2, y, x),
wei_kcyx(k, 4 * c + 3, y, x));
#endif
};
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
std::thread::hardware_concurrency());
//
DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// 3x3, 34x34, 128 thread, fp32, vector = 1
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, fp32, vector = 2
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, int8, vector = 4
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 8;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 4;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 1
// 1x1, 32x32, 128 thread, int8, vector = 4
constexpr unsigned NPerBlock = 1;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 16;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 1;
constexpr unsigned KPerThread = 8;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 4;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#endif
constexpr unsigned GridSize =
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
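// Worked example (illustrative, assuming the "1x1 filter, 32x32 image" problem from
// the host driver: N = 64, K = 512, Ho = Wo = 32) with the tile sizes selected above:
// GridSize = (64/1) * (512/64) * (32/4) * (32/32) = 64 * 8 * 8 * 1 = 4096.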
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(unsigned i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
TOut,
accum_t,
decltype(in_nchw_vec_desc),
decltype(wei_kcyx_vec_desc),
decltype(out_nkhw_desc),
NVector,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 1
#if 0
// for 3x3, 34x34
constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 64;
......@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
constexpr unsigned WoPerThread = 1;
constexpr unsigned BlockSize = 128;
#elif 1
#elif 0
// for 1x1, 28x28
constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 128;
......@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
constexpr unsigned OutThreadCopyDataPerWrite = 2;
constexpr unsigned BlockSize = 128;
#elif 1
// for 1x1, 14x14
constexpr unsigned NPerBlock = 16;
constexpr unsigned KPerBlock = 128;
constexpr unsigned CPerBlock = 8;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 2;
constexpr unsigned NPerThread = 4;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
constexpr unsigned InBlockCopyDataPerRead = 4;
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned GemmMPerThreadSubC = 4;
constexpr unsigned GemmNPerThreadSubC = 4;
constexpr unsigned GemmMLevel0Cluster = 4;
constexpr unsigned GemmNLevel0Cluster = 2;
constexpr unsigned GemmMLevel1Cluster = 2;
constexpr unsigned GemmNLevel1Cluster = 4;
constexpr unsigned GemmKPerThreadLoop = 1;
constexpr unsigned OutThreadCopyDataPerWrite = 2;
constexpr unsigned BlockSize = 128;
#endif
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp"
#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
......@@ -209,9 +210,13 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
for(unsigned i = 0; i < nrepeat; ++i)
{
float time =
launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer<
GridSize,
float time = launch_kernel(
#if 0
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn
#else
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_chwn_desc),
......
......@@ -7,10 +7,11 @@
#include "tensor.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
#include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_2.hpp"
//#include "device_direct_convolution_1.hpp"
#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
struct GeneratorTensor_1
......@@ -34,25 +35,6 @@ struct GeneratorTensor_2
}
};
struct GeneratorTensor_3
{
template <class... Is>
double operator()(Is... is)
{
#if 0
std::initializer_list<std::size_t> ls = {static_cast<std::size_t>(is)...};
return std::accumulate(ls.begin(), ls.end(), std::size_t(0));
#elif 1
assert(sizeof...(Is) > 0);
std::initializer_list<std::size_t> ids = {static_cast<std::size_t>(is)...};
std::vector<std::size_t> lens(sizeof...(Is), 100);
std::vector<std::size_t> strides(sizeof...(Is), 1);
std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is)-1), strides.rbegin() + 1);
return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1;
#endif
}
};
struct GeneratorTensor_Checkboard
{
template <class... Ts>
......@@ -106,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc)
return TensorDescriptor(lengths, strides);
}
template <class T, class LowerPads, class UpperPads>
void host_direct_convolution(
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
void host_direct_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads)
{
unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
......@@ -129,26 +114,29 @@ void host_direct_convolution(
if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in_nchw.mDesc.GetLengths()[3])
{
v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x);
v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x));
}
}
}
}
out(n, k, ho, wo) = v;
out_nkhw(n, k, ho, wo) = v;
};
auto f_par = make_ParallelTensorFunctor(f,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3]);
out_nkhw.mDesc.GetLengths()[0],
out_nkhw.mDesc.GetLengths()[1],
out_nkhw.mDesc.GetLengths()[2],
out_nkhw.mDesc.GetLengths()[3]);
f_par(std::thread::hardware_concurrency());
}
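// Reference formula computed by the functor above (assuming unit stride, with padding
// handled by the bounds check): out_nkhw(n, k, ho, wo) =
//   sum over c, y, x of in_nchw(n, c, ho + y - h_pad_low, wo + x - w_pad_low) * wei_kcyx(k, c, y, x),
// accumulated via the double casts before being stored as TOut.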
template <class T, class LowerPads, class UpperPads>
void host_winograd_3x3_convolution(
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx, Tensor<T>& out, LowerPads, UpperPads)
template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
const Tensor<TWei>& wei_kcyx,
Tensor<TOut>& out_nkhw,
LowerPads,
UpperPads)
{
constexpr std::size_t HoPerTile = 2;
constexpr std::size_t WoPerTile = 2;
......@@ -162,8 +150,8 @@ void host_winograd_3x3_convolution(
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t HO = out.mDesc.GetLengths()[2];
std::size_t WO = out.mDesc.GetLengths()[3];
std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
unsigned h_pad_low = LowerPads{}.Get(Number<0>{});
unsigned w_pad_low = LowerPads{}.Get(Number<1>{});
......@@ -177,11 +165,11 @@ void host_winograd_3x3_convolution(
std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;
Tensor<T> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<T> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<T> wei_transform({K, C, HiPerTile, WiPerTile});
Tensor<T> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
Tensor<T> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
for(int j = 0; j < HiPerTile; ++j)
......@@ -198,7 +186,7 @@ void host_winograd_3x3_convolution(
}
else
{
in_hold(n, c, htile, wtile, j, i) = T(0);
in_hold(n, c, htile, wtile, j, i) = TIn(0);
}
}
}
......@@ -259,49 +247,61 @@ void host_winograd_3x3_convolution(
};
auto f_wei_transform = [&](auto k, auto c) {
wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0);
wei_transform(k, c, 0, 1) =
0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
wei_transform(k, c, 0, 2) =
0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2);
wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2);
wei_transform(k, c, 1, 0) =
0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) +
0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
0.25 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) -
0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) +
0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
0.25 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 1, 3) =
0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 2, 0) =
0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0);
wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) +
0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) -
0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) +
0.25 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) +
0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) +
0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) +
0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) +
0.25 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 2, 3) =
0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0);
wei_transform(k, c, 3, 1) =
0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 3, 2) =
0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2);
wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0));
wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
0.5 * double(wei_kcyx(k, c, 0, 1)) +
0.5 * double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
0.5 * double(wei_kcyx(k, c, 0, 1)) +
0.5 * double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2));
wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) +
0.5 * double(wei_kcyx(k, c, 1, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 1, 1) =
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) +
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 1, 2) =
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) -
0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) +
0.5 * double(wei_kcyx(k, c, 1, 2)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) -
0.5 * double(wei_kcyx(k, c, 1, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 2, 1) =
0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) -
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 2) =
0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) +
0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) +
0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) +
0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) +
0.25 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) -
0.5 * double(wei_kcyx(k, c, 1, 2)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0));
wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) +
0.5 * double(wei_kcyx(k, c, 2, 1)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) -
0.5 * double(wei_kcyx(k, c, 2, 1)) +
0.5 * double(wei_kcyx(k, c, 2, 2));
wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2));
};
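// Note (added for clarity, not in the original commit): the lambda above is the
// F(2x2, 3x3) Winograd weight transform U = G g G^T, where g is the 3x3 filter
// wei_kcyx(k, c, :, :) and
//     G = [ 1     0     0  ]
//         [ 1/2   1/2   1/2]
//         [ 1/2  -1/2   1/2]
//         [ 0     0     1  ],
// which reproduces the 0.5 / 0.25 coefficient pattern written out explicitly above.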
auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) {
......@@ -354,7 +354,7 @@ void host_winograd_3x3_convolution(
for(int i = 0; i < WoPerTile; ++i)
{
std::size_t wo = WoPerTile * wtile + i;
out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
}
}
};
......@@ -372,20 +372,25 @@ void host_winograd_3x3_convolution(
template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
// printf("\n");
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
error += std::abs(ref.mData[i] - result.mData[i]);
float diff = std::abs(ref.mData[i] - result.mData[i]);
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
// printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i]));
}
// printf("\n");
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
......@@ -404,7 +409,7 @@ int main(int argc, char* argv[])
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 1
#elif 0
// 3x3, 34x34
constexpr unsigned N = 64;
constexpr unsigned C = 256;
......@@ -563,6 +568,30 @@ int main(int argc, char* argv[])
constexpr unsigned HPad = 2;
constexpr unsigned WPad = 2;
#elif 0
// 1x1 filter, 32x32 image
constexpr unsigned N = 64;
constexpr unsigned C = 256;
constexpr unsigned HI = 32;
constexpr unsigned WI = 32;
constexpr unsigned K = 512;
constexpr unsigned Y = 1;
constexpr unsigned X = 1;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 1
// 1x1 filter, 14x14 image
constexpr unsigned N = 128;
constexpr unsigned C = 2048;
constexpr unsigned HI = 14;
constexpr unsigned WI = 14;
constexpr unsigned K = 512;
constexpr unsigned Y = 1;
constexpr unsigned X = 1;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#endif
auto lower_pads = Sequence<HPad, WPad>{};
......@@ -577,10 +606,12 @@ int main(int argc, char* argv[])
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
Tensor<float> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<float> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<float> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
Tensor<float> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
using in_data_t = float;
using out_data_t = float;
Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
......@@ -601,9 +632,13 @@ int main(int argc, char* argv[])
#elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
}
......@@ -611,7 +646,9 @@ int main(int argc, char* argv[])
#if 0
device_direct_convolution_1
#elif 0
device_direct_convolution_2
device_direct_convolution_2_nchw_kcyx_nkhw
#elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 1
device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0
......@@ -633,7 +670,6 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 1
if(Y == 3 && X == 3)
{
host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
......@@ -643,7 +679,6 @@ int main(int argc, char* argv[])
host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
}
check_error(out_nkhw_host, out_nkhw_device);
#endif
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......
#pragma once
template <class TData, unsigned NSize>
struct Array
{
using Type = Array<TData, NSize>;
static constexpr unsigned nSize = NSize;
unsigned mData[nSize];
template <class... Xs>
__host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
{
}
__host__ __device__ TData operator[](unsigned i) const { return mData[i]; }
};
......@@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence<L0
template <class Lengths, class Strides>
struct ConstantTensorDescriptor
{
using Type = ConstantTensorDescriptor<Lengths, Strides>;
static constexpr unsigned nDim = Lengths::nDim;
using NDimConstant = Number<nDim>;
__host__ __device__ constexpr ConstantTensorDescriptor()
{
......@@ -91,293 +91,70 @@ struct ConstantTensorDescriptor
return Strides{}.Get(Number<I>{});
}
__host__ __device__ constexpr unsigned GetElementSize() const
// C++14 doesn't support constexpr lambdas, so use this functor trick instead
struct GetElementSize_f
{
static_assert(nDim >= 2 && nDim <= 8, "nDim");
if(nDim == 2)
template <class IDim>
__host__ __device__ constexpr unsigned operator()(IDim idim) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
return GetLength(I0) * GetLength(I1);
return Type{}.GetLength(idim);
}
else if(nDim == 3)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
};
return GetLength(I0) * GetLength(I1) * GetLength(I2);
}
else if(nDim == 4)
__host__ __device__ constexpr unsigned GetElementSize() const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3);
}
else if(nDim == 5)
// C++14 doesn't support constexpr lambdas, so use this functor trick instead
struct multiply
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4);
}
else if(nDim == 6)
__host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
GetLength(I5);
return a * b;
}
else if(nDim == 7)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
};
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
GetLength(I5) * GetLength(I6);
return static_const_reduce_n<nDim>{}(GetElementSize_f{}, multiply{});
}
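// Illustrative expansion (not in the original commit): for a 4-d descriptor,
// static_const_reduce_n<4> unrolls to
//   multiply(GetLength(I3), multiply(GetLength(I2), multiply(GetLength(I1), GetLength(I0)))),
// i.e. the product of all lengths; e.g. Lengths = Sequence<2, 3, 4, 5> gives 120.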
else if(nDim == 8)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) *
GetLength(I5) * GetLength(I6) * GetLength(I7);
}
else
// C++14 doesn't support constexpr lambdas, so use this functor trick instead
struct GetElementSpace_f
{
assert(false);
}
template <class IDim>
__host__ __device__ constexpr unsigned operator()(IDim idim) const
{
return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim);
}
};
template <class Align = Number<1>>
__host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const
{
static_assert(nDim >= 2 && nDim <= 8, "nDim");
constexpr unsigned align_size = align.Get();
if(nDim == 2)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
align_size;
}
else if(nDim == 3)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + align_size;
}
else if(nDim == 4)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
align_size;
}
else if(nDim == 5)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
(GetLength(I4) - 1) * GetStride(I4) + align_size;
}
else if(nDim == 6)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
align_size;
}
else if(nDim == 7)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
(GetLength(I6) - 1) * GetStride(I6) + align_size;
}
else if(nDim == 8)
// C++14 doesn't support constexpr lambdas, so use this functor trick instead
struct add
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
(GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) +
(GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) +
(GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) +
align_size;
}
}
// this is ugly, only for 2d
__host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
static_assert(nDim == 2, "nDim is not 2");
return i0 * GetStride(I0) + i1 * GetStride(I1);
}
// this is ugly, only for 3d
__host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
static_assert(nDim == 3, "nDim is not 3");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2);
}
// this is ugly, only for 4d
__host__ __device__ unsigned
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
__host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(nDim == 4, "nDim is not 4");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3);
return a + b;
}
};
// this is ugly, only for 5d
__host__ __device__ unsigned
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
static_assert(nDim == 5, "nDim is not 5");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
i4 * GetStride(I4);
return static_const_reduce_n<nDim>{}(GetElementSpace_f{}, add{}) + align.Get();
}
// this is ugly, only for 6d
__host__ __device__ unsigned
Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const
template <class... Is>
__host__ __device__ unsigned Get1dIndex(Is... is) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong");
static_assert(nDim == 6, "nDim is not 6");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
i4 * GetStride(I4) + i5 * GetStride(I5);
}
const auto multi_id = Array<unsigned, nDim>(is...);
// this is ugly, only for 7d
__host__ __device__ unsigned Get1dIndex(unsigned i0,
unsigned i1,
unsigned i2,
unsigned i3,
unsigned i4,
unsigned i5,
unsigned i6) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
static_assert(nDim == 7, "nDim is not 7");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6);
}
unsigned id = 0;
// this is ugly, only for 8d
__host__ __device__ unsigned Get1dIndex(unsigned i0,
unsigned i1,
unsigned i2,
unsigned i3,
unsigned i4,
unsigned i5,
unsigned i6,
unsigned i7) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
constexpr auto I7 = Number<7>{};
static_loop_n<nDim>{}([&](auto IDim) {
constexpr unsigned idim = IDim.Get();
id += multi_id[idim] * GetStride(IDim);
});
static_assert(nDim == 8, "nDim is not 8");
return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) +
i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7);
return id;
}
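// Usage note (illustrative, not in the original commit): for an NCHW descriptor with
// Strides = Sequence<C*H*W, H*W, W, 1>, the unrolled loop above computes
//   Get1dIndex(n, c, h, w) = n*C*H*W + c*H*W + h*W + w.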
__host__ __device__ constexpr auto Condense() const
......@@ -385,6 +162,12 @@ struct ConstantTensorDescriptor
constexpr auto default_strides = calculate_default_strides(Lengths{});
return ConstantTensorDescriptor<Lengths, decltype(default_strides)>{};
}
template <unsigned IDim, unsigned NVector>
__host__ __device__ constexpr auto Vectorize(Number<IDim>, Number<NVector>) const
{
assert(false); // not implemented
}
};
template <class Lengths>
......
#pragma once
#include "constant_integral.hip.hpp"
#include "functional.hip.hpp"
template <unsigned... Is>
struct Sequence
{
using Type = Sequence<Is...>;
static constexpr unsigned nDim = sizeof...(Is);
const unsigned mData[nDim] = {Is...};
template <unsigned I>
__host__ __device__ constexpr unsigned Get(Number<I>) const
{
return mData[I];
}
// this is ugly, only for nDim == 4
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
{
static_assert(nDim == 4, "nDim != 4");
constexpr auto old_sequence = Type{};
constexpr unsigned NR0 = old_sequence.mData[I0];
constexpr unsigned NR1 = old_sequence.mData[I1];
constexpr unsigned NR2 = old_sequence.mData[I2];
constexpr unsigned NR3 = old_sequence.mData[I3];
return Sequence<NR0, NR1, NR2, NR3>{};
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
{
// don't know how to implement this
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
template <unsigned I>
__host__ __device__ constexpr auto PushBack(Number<I>) const
{
return Sequence<Is..., I>{};
}
__host__ __device__ constexpr auto PopBack() const;
template <class F>
__host__ __device__ constexpr auto Transform(F f) const
{
return Sequence<f(Is)...>{};
}
};
template <unsigned... Is, unsigned I>
__host__ __device__ constexpr auto sequence_pop_back(Sequence<Is..., I>)
{
static_assert(sizeof...(Is) >= 1, "empty Sequence!");
return Sequence<Is...>{};
}
template <class F, unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_op(Sequence<Xs...>, Sequence<Ys...>, F f)
{
static_assert(Sequence<Xs...>::nDim == Sequence<Ys...>::nDim, "Dim not the same");
return Sequence<f(Xs, Ys)...>{};
}
template <unsigned... Xs, unsigned... Ys>
__host__ __device__ constexpr auto sequence_sequence_add(Sequence<Xs...>, Sequence<Ys...>)
{
struct add
{
__host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const
{
return x + y;
}
};
return sequence_sequence_op(Sequence<Xs...>{}, Sequence<Ys...>{}, add{});
}
template <unsigned... Is>
__host__ __device__ constexpr auto Sequence<Is...>::PopBack() const
{
return sequence_pop_back(Type{});
}
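// Usage sketch (illustrative, not in the original commit):
//   Sequence<2, 3, 4>{}.PushBack(Number<5>{}) yields Sequence<2, 3, 4, 5>{};
//   Sequence<2, 3, 4, 5>{}.PopBack() yields Sequence<2, 3, 4>{};
//   sequence_sequence_add(Sequence<1, 2>{}, Sequence<10, 20>{}) yields Sequence<11, 22>{}.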
......@@ -373,7 +373,7 @@ template <unsigned BlockSize,
unsigned DataPerRead>
struct Blockwise2dTensorCopy3
{
using vector_t = typename vector_type<Float, DataPerRead>::type;
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset;
......@@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1,
"wrong! only support stride1 == 1!\n");
static_assert(DataPerRead == 1 ||
(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1),
"wrong! only support stride1 == 1 if DataPerRead > 1!\n");
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
"wrong! only support DataPerRead == 1, 2 or 4!\n");
......
......@@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[3] = is / ref_desc.GetStride(I3);
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
f(p_src[aindex], p_dst[bindex]);
f(p_src[src_index], p_dst[dst_index]);
}
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
......@@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
did[3] = is / ref_desc.GetStride(I3);
const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]);
const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]);
f(p_src[aindex], p_dst[bindex]);
f(p_src[src_index], p_dst[dst_index]);
}
}
}
......@@ -199,15 +199,110 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
}
template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
template <unsigned BlockSize,
class Float,
class SrcDesc,
class DstDesc,
class CopyLengths,
unsigned DataPerRead>
struct Blockwise4dTensorCopy1
{
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
__device__ constexpr Blockwise4dTensorCopy1()
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(DataPerRead == 1 ||
(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
"wrong! only support stride3 == 1 if DataPerRead > 1!\n");
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
"wrong! only support DataPerRead == 1, 2 or 4!\n");
static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
DstDesc{}.GetStride(I2) % DataPerRead == 0,
"src and dst stride2 should be multiple of DataPerRead to keep alignment");
// we allow out-of-bound read from src in D3 dimension,
// but we need to make sure dst stride2 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr unsigned L3 = CopyLengths{}.Get(I3);
constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2),
"wrong! out-of-bound write will contaminate next line!\n");
}
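// Illustrative example (not in the original commit): with L3 = 34 and DataPerRead = 4,
// read_per_d3 = 9, so each innermost line touches 36 elements; the static_assert above
// then requires DstDesc stride2 >= 36 so that the 2-element overrun cannot spill into
// the next line of the destination.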
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr unsigned L0 = CopyLengths{}.Get(I0);
constexpr unsigned L1 = CopyLengths{}.Get(I1);
constexpr unsigned L2 = CopyLengths{}.Get(I2);
constexpr unsigned L3 = CopyLengths{}.Get(I3);
constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead);
constexpr auto ref_desc =
make_ConstantTensorDescriptor(Sequence<L0, L1, L2, read_per_d3>{});
constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
auto f_copy = [&](unsigned is) {
unsigned did[4];
did[0] = is / ref_desc.GetStride(I0);
is -= did[0] * ref_desc.GetStride(I0);
did[1] = is / ref_desc.GetStride(I1);
is -= did[1] * ref_desc.GetStride(I1);
did[2] = is / ref_desc.GetStride(I2);
is -= did[2] * ref_desc.GetStride(I2);
did[3] = is / ref_desc.GetStride(I3);
const unsigned src_index =
src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
const unsigned dst_index =
dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead);
*(reinterpret_cast<vector_t*>(p_dst + dst_index)) =
*(reinterpret_cast<const vector_t*>(p_src + src_index));
};
for(unsigned iloop = 0; iloop < NLoop; ++iloop)
{
unsigned is = threadIdx.x + iloop * BlockSize;
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
f_copy(is);
}
constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);
if(has_tail)
{
unsigned is = threadIdx.x + NLoop * BlockSize;
if(is < ref_desc.GetElementSize())
{
f_copy(is);
}
}
}
};
......@@ -350,7 +445,7 @@ template <unsigned BlockSize,
unsigned DataPerRead>
struct Blockwise4dTensorCopy3
{
using vector_t = typename vector_type<Float, DataPerRead>::type;
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset;
......@@ -362,8 +457,9 @@ struct Blockwise4dTensorCopy3
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
"wrong! only support stride3 == 1!\n");
static_assert(DataPerRead == 1 ||
(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1),
"wrong! only support stride3 == 1 if DataPerRead > 1!\n");
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
"wrong! only support DataPerRead == 1, 2 or 4!\n");
......@@ -371,7 +467,7 @@ struct Blockwise4dTensorCopy3
static_assert(
SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
DstDesc{}.GetStride(I2) % DataPerRead == 0,
"wrong! src and dst stride should be multiple of DataPerRead to keep alignment");
"wrong! src and dst stride2 should be multiple of DataPerRead to keep alignment");
constexpr unsigned L0 = CopyLengths{}.Get(I0);
constexpr unsigned L1 = CopyLengths{}.Get(I1);
......
......@@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
#pragma unroll
for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
{
// read first batch of A, B
// copy A-sub to form A
#pragma unroll
// read first batch of A, B
// copy A-sub to form A
//#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
#if 0
threadwise_matrix_copy(
a_block_mtx,
p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
......@@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
{
p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] =
p_a_block[a_block_mtx.Get1dIndex(k_begin + i,
m_repeat * MPerLevel1Cluster + j) +
mMyThreadOffsetA];
}
}
#endif
}
// copy B-sub to form B
#pragma unroll
// copy B-sub to form B
//#pragma unroll
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
#if 0
threadwise_matrix_copy(
b_block_mtx,
p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
......@@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
{
p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] =
p_b_block[b_block_mtx.Get1dIndex(k_begin + i,
n_repeat * MPerLevel1Cluster + j) +
mMyThreadOffsetB];
}
}
#endif
}
// loop over batch
#pragma unroll
// loop over batch
//#pragma unroll
for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
{
// do current batch of gemm
// do current batch of gemm
#if 0
threadwise_gemm(a_thread_mtx,
True,
p_a_thread,
......@@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
False,
p_c_thread + ib * ThreadMatrixStrideC,
f_accum);
#else
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
{
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
{
const unsigned aindex =
a_thread_mtx.Get1dIndex(k, i); // A is transposed
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
const unsigned cindex =
c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC;
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
}
}
}
#endif
// read next batch of a, b
if(BlockMatrixStrideA != 0)
{
#pragma unroll
//#pragma unroll
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
#if 0
threadwise_matrix_copy(
a_block_mtx,
p_a_block +
......@@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
a_thread_mtx,
p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
a_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j)
{
p_a_thread[a_thread_mtx.Get1dIndex(i,
m_repeat * MPerThreadSubC + j)] =
p_a_block[a_block_mtx.Get1dIndex(
k_begin + i, m_repeat * MPerLevel1Cluster + j) +
(ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA];
}
}
#endif
}
}
if(BlockMatrixStrideB != 0)
{
#pragma unroll
//#pragma unroll
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
#if 0
threadwise_matrix_copy(
b_block_mtx,
p_b_block +
......@@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
b_thread_mtx,
p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
b_thread_sub_mtx.GetLengths());
#else
for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j)
{
p_b_thread[b_thread_mtx.Get1dIndex(i,
n_repeat * NPerThreadSubC + j)] =
p_b_block[b_block_mtx.Get1dIndex(
k_begin + i, n_repeat * MPerLevel1Cluster + j) +
(ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB];
}
}
#endif
}
}
}
// do last batch of gemm
// do last batch of gemm
#if 0
threadwise_gemm(a_thread_mtx,
True,
p_a_thread,
......@@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
False,
p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC,
f_accum);
#else
for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k)
{
for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i)
{
for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j)
{
const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed
const unsigned bindex = b_thread_mtx.Get1dIndex(k, j);
const unsigned cindex = c_thread_mtx.Get1dIndex(i, j) +
(BatchPerThread - 1) * ThreadMatrixStrideC;
f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]);
}
}
}
#endif
}
}
......@@ -551,9 +642,9 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
c_thread_mtx_begin.batch * BlockMatrixStrideC +
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
threadwise_matrix_copy(
c_thread_sub_mtx,
......
#pragma once
#include "data_type.hip.hpp"
#include "constant_integral.hip.hpp"
#include "Sequence.hip.hpp"
#include "Array.hip.hpp"
#include "functional.hip.hpp"
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
......@@ -16,129 +21,6 @@ struct is_same<T, T>
static const bool value = true;
};
template <class T, unsigned N>
struct vector_type
{
};
template <>
struct vector_type<float, 1>
{
using type = float;
};
template <>
struct vector_type<float, 2>
{
using type = float2;
};
template <>
struct vector_type<float, 4>
{
using type = float4;
};
#if 0
template <>
struct vector_type<half_float::half, 1>
{
using type = half_float::half;
};
template <>
struct vector_type<half_float::half, 2>
{
using type = float;
};
template <>
struct vector_type<half_float::half, 4>
{
using type = float2;
};
template <>
struct vector_type<half_float::half, 8>
{
using type = float4;
};
#endif
#if 0
template <>
struct vector_type<half, 1>
{
using type = half;
};
template <>
struct vector_type<half, 2>
{
using type = half2;
};
template <>
struct vector_type<half, 4>
{
using type = float2;
};
template <>
struct vector_type<half, 8>
{
using type = float4;
};
#endif
template <class T, T N>
struct integral_constant
{
static const T value = N;
__host__ __device__ constexpr T Get() const { return value; }
};
template <unsigned N>
using Number = integral_constant<unsigned, N>;
template <unsigned... Is>
struct Sequence
{
using Type = Sequence<Is...>;
static constexpr unsigned nDim = sizeof...(Is);
const unsigned mData[nDim] = {Is...};
template <unsigned I>
__host__ __device__ constexpr unsigned Get(Number<I>) const
{
return mData[I];
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence<I0, I1, I2, I3>) const
{
constexpr auto old_sequence = Type{};
constexpr unsigned NR0 = old_sequence.mData[I0];
constexpr unsigned NR1 = old_sequence.mData[I1];
constexpr unsigned NR2 = old_sequence.mData[I2];
constexpr unsigned NR3 = old_sequence.mData[I3];
return Sequence<NR0, NR1, NR2, NR3>{};
}
template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
__host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence<I0, I1, I2, I3>) const
{
// don't know how to implement this
printf("Sequence::ReorderByPutOldToNew not implemented");
assert(false);
}
};
#if DEVICE_BACKEND_CUDA
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
......
......@@ -4,9 +4,10 @@
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "cuda_fp16.h"
#endif
#pragma once
template <class T, T N>
struct integral_constant
{
static const T value = N;
__host__ __device__ constexpr T Get() const { return value; }
};
template <unsigned N>
using Number = integral_constant<unsigned, N>;
#pragma once
#include "config.h"
template <class T, unsigned N>
struct vector_type
{
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2;
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
union
{
MemoryType vector;
float scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<float, 4>
{
using MemoryType = float4;
};
template <>
struct vector_type<float2, 2>
{
using MemoryType = float4;
};
#if 0
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
__host__ __device__ static MemoryType Pack(half s) { return s; }
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2;
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
union
{
MemoryType vector;
half scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = float2;
};
template <>
struct vector_type<half, 8>
{
using MemoryType = float4;
};
template <>
struct vector_type<char, 1>
{
using MemoryType = char;
__host__ __device__ static MemoryType Pack(char s) { return s; }
};
template <>
struct vector_type<char, 2>
{
using MemoryType = int16_t;
__host__ __device__ static MemoryType Pack(char s0, char s1)
{
union
{
MemoryType vector;
char scalar[2];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<char, 4>
{
using MemoryType = int32_t;
__host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
{
union
{
MemoryType vector;
char scalar[4];
} data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<char, 8>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<int32_t, 2>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<char2, 2>
{
using MemoryType = char4;
};
template <>
struct vector_type<char2, 4>
{
using MemoryType = int64_t;
};
template <>
struct vector_type<char4, 1>
{
using MemoryType = int;
};
template <>
struct vector_type<char4, 2>
{
using MemoryType = int64_t;
};
#endif
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
d += s0.z * s1.z;
d += s0.w * s1.w;
}
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
// TODO: this interface is misleading; s0 and s1 are actually int8x4
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
#if DEVICE_BACKEND_CUDA
d = __dp4a(s0, s1, d);
#endif
}
#endif
#pragma once
#include "constant_integral.hip.hpp"
template <unsigned NLoop>
struct static_loop_n
{
template <class F>
__host__ __device__ void operator()(F f) const
{
static_assert(NLoop > 1, "out-of-range");
f(Number<NLoop - 1>{});
static_loop_n<NLoop - 1>{}(f);
}
};
template <>
struct static_loop_n<1>
{
template <class F>
__host__ __device__ void operator()(F f) const
{
f(Number<0>{});
}
};
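// Usage sketch (illustrative only): static_loop_n<NLoop>{}(f) expands to
// f(Number<NLoop - 1>{}), f(Number<NLoop - 2>{}), ..., f(Number<0>{}),
// i.e. a compile-time unrolled loop over the indices, each index passed as a tag type.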
template <unsigned NLoop>
struct static_const_reduce_n
{
template <class F, class Reduce>
__host__ __device__ constexpr auto operator()(F f, Reduce r) const
{
static_assert(NLoop > 1, "out-of-range");
constexpr auto a = f(Number<NLoop - 1>{});
auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // cannot use constexpr here, weird
return r(a, b);
}
};
template <>
struct static_const_reduce_n<1>
{
template <class F, class Reduce>
__host__ __device__ constexpr auto operator()(F f, Reduce) const
{
return f(Number<0>{});
}
};
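// Usage sketch (illustrative only): for NLoop = 3,
//   static_const_reduce_n<3>{}(f, r)
// evaluates to
//   r(f(Number<2>{}), r(f(Number<1>{}), f(Number<0>{})))
// i.e. a compile-time right fold of f over the indices, combined with the reduction r.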
#if 0
template<class F>
__host__ __device__ constexpr auto unpacker(F f)
{
return [=](auto xs_array){ f(xs...); };
}
#endif
\ No newline at end of file
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
......@@ -20,9 +21,12 @@ template <class Float,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global,
__global__ void
gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
......@@ -31,50 +35,72 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_global_desc = InGlobalDesc{};
constexpr auto wei_global_desc = WeiGlobalDesc{};
constexpr auto out_global_desc = OutGlobalDesc{};
constexpr auto in_nchw_global_desc = InGlobalDesc{};
constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned Y = wei_global_desc.GetLength(I2);
constexpr unsigned X = wei_global_desc.GetLength(I3);
constexpr unsigned N = in_nchw_global_desc.GetLength(I0);
constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3);
constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<KPerBlock, CPerBlock * Y * X>{},
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
constexpr auto wei_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{});
constexpr auto wei_kcyx_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
// shared mem
constexpr unsigned in_block_size = in_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
constexpr unsigned in_block_size =
in_nchw_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_kcyx_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
constexpr unsigned WiPerThread = WoPerThread + X - 1;
constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{}, in_block_desc.GetStrides());
constexpr auto in_nchw_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
in_nchw_block_desc.GetStrides());
constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_block_desc.GetStrides());
constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());
constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
in_thread_block_desc, wei_thread_block_desc);
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
in_nchw_thread_block_desc, wei_kcyx_thread_block_desc);
// register
Float p_out_thread[out_thread_desc.GetElementSpace()];
Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
const unsigned block_id = blockIdx.x;
......@@ -121,26 +147,37 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
decltype(in_global_desc),
decltype(in_block_desc),
decltype(in_block_desc.GetLengths())>{};
decltype(in_nchw_global_desc),
decltype(in_nchw_block_desc),
decltype(in_nchw_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#if 0
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
Float,
decltype(wei_global_desc),
decltype(wei_block_desc),
decltype(wei_block_desc.GetLengths())>{};
decltype(wei_kcyx_global_desc),
decltype(wei_kcyx_block_desc),
decltype(wei_kcyx_block_desc.GetLengths()),
1>{};
#elif 1
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ke_global_desc),
decltype(wei_ke_block_desc),
decltype(wei_ke_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_global +
in_global_desc.Get1dIndex(n_block_data_begin,
in_nchw_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
......@@ -148,7 +185,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
// copy weight tensor to LDS
blockwise_wei_copy.Run(
p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_global +
wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_block);
__syncthreads();
......@@ -158,27 +196,29 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
in_thread_block_desc,
in_nchw_thread_block_desc,
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
wei_kcyx_thread_block_desc,
p_wei_block +
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#elif 0
threadwise_direct_convolution_3(
in_thread_block_desc,
in_nchw_thread_block_desc,
p_in_block +
in_block_desc.Get1dIndex(n_thread_data_begin,
in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_thread_block_desc,
p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_thread_desc,
wei_kcyx_thread_block_desc,
p_wei_block +
wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#endif
}
......@@ -186,13 +226,13 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
// copy output tensor from register to global mem
threadwise_4d_tensor_copy(
out_thread_desc,
out_nkhw_thread_desc,
p_out_thread,
out_global_desc,
out_nkhw_global_desc,
p_out_global +
out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_thread_desc.GetLengths());
out_nkhw_thread_desc.GetLengths());
}
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_direct_convolution.hip.hpp"
#include "threadwise_4d_tensor_op.hip.hpp"
#include "threadwise_direct_convolution.hip.hpp"
template <class TInWei,
class TOut,
class TAccum,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned ScalarPerVector,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead,
unsigned BlockSize,
unsigned GridSize>
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
const typename vector_type<TInWei,
ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
const typename vector_type<TInWei,
ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
TOut* const __restrict__ p_out_global)
{
using in_scalar_t = TInWei;
using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
using out_scalar_t = TOut;
using accum_t = TAccum;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_vec_global_desc = InGlobalDesc{};
constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{};
constexpr auto out_nkhw_global_desc = OutGlobalDesc{};
constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0);
constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0);
constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1);
constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2);
constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3);
constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor(
Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy
constexpr unsigned HiPerBlock = HoPerBlock + Y - 1;
constexpr unsigned WiPerBlock = WoPerBlock + X - 1;
constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<KPerBlock, CPerBlock * Y * X>{},
Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy
constexpr auto wei_kcyx_vec_block_desc =
make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
Sequence<wei_ke_vec_block_desc.GetStride(I0), Y * X, X, 1>{});
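    // Note (interpretation, not from the original source): the 4d [KPerBlock, CPerBlock, Y, X]
    // view reuses the row stride of the aligned 2d [KPerBlock, CPerBlock * Y * X] view, so both
    // descriptors address the same LDS layout: the blockwise copy writes through the 2d view
    // and the threadwise convolution reads through the 4d view.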
// shared mem
constexpr unsigned in_block_size =
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
__shared__ in_vector_mem_t
p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ in_vector_mem_t
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
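    // Note (interpretation, not from the original source): both LDS sizes are rounded up to a
    // multiple of max_align, presumably so that accesses of up to max_align vectors never run
    // past a buffer and the two buffers stay mutually aligned for the widest copy width.
    // E.g. in_block_size = 1023 with max_align = 4 gives 4 * ((1023 + 3) / 4) = 1024 elements.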
// threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1;
constexpr unsigned WiPerThread = WoPerThread + X - 1;
constexpr auto in_nchw_vec_thread_block_desc =
make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
in_nchw_vec_block_desc.GetStrides());
constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_vec_block_desc.GetStrides());
constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(
in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);
// register
out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
// divide block work
constexpr unsigned NBlockWork =
(out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
constexpr unsigned KBlockWork =
(out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork =
(out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork =
(out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
const unsigned block_id = blockIdx.x;
unsigned itmp = block_id;
const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork);
itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork);
const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork);
itmp -= k_block_work_id * (HBlockWork * WBlockWork);
const unsigned h_block_work_id = itmp / WBlockWork;
const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
// divide thread work
constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread;
constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread;
const unsigned thread_id = threadIdx.x;
itmp = thread_id;
const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork);
itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork);
const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork);
itmp -= k_thread_work_id * (HThreadWork * WThreadWork);
const unsigned h_thread_work_id = itmp / WThreadWork;
const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork;
const unsigned n_thread_data_begin = n_thread_work_id * NPerThread;
const unsigned k_thread_data_begin = k_thread_work_id * KPerThread;
const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread;
const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread;
const unsigned hi_thread_data_begin = ho_thread_data_begin;
const unsigned wi_thread_data_begin = wo_thread_data_begin;
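    // Note (worked example, added for clarity): the linear ids are decoded in row-major order
    // over the (N, K, H, W) work grid. E.g. with NThreadWork = 1, KThreadWork = 8,
    // HThreadWork = 1, WThreadWork = 16 (BlockSize = 128) and thread_id = 37:
    //   n_thread_work_id = 37 / (8 * 1 * 16) = 0, remainder 37
    //   k_thread_work_id = 37 / (1 * 16)     = 2, remainder 5
    //   h_thread_work_id = 5 / 16            = 0
    //   w_thread_work_id = 5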
constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize,
in_vector_mem_t,
decltype(in_nchw_vec_global_desc),
decltype(in_nchw_vec_block_desc),
decltype(in_nchw_vec_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#if 0
constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize,
in_vector_mem_t,
decltype(wei_kcyx_vec_global_desc),
decltype(wei_kcyx_vec_block_desc),
decltype(wei_kcyx_vec_block_desc.GetLengths()),
1>{};
#elif 1
const auto blockwise_wei_copy =
Blockwise2dTensorCopy3<BlockSize,
in_vector_mem_t,
decltype(wei_ke_vec_global_desc),
decltype(wei_ke_vec_block_desc),
decltype(wei_ke_vec_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
#if 1 // debug
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
#endif
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
c_block_data_begin += CPerBlock, __syncthreads())
{
// copy input tensor to LDS
blockwise_in_copy.Run(p_in_vec_global +
in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
c_block_data_begin,
hi_block_data_begin,
wi_block_data_begin),
p_in_vec_block);
// copy weight tensor to LDS
blockwise_wei_copy.Run(
p_wei_vec_global +
wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_vec_block);
__syncthreads();
for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
{
// threadwise convolution
#if 1
threadwise_direct_convolution_2(
in_nchw_vec_thread_block_desc,
p_in_vec_block +
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc,
p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#elif 0
threadwise_direct_convolution_3(
in_nchw_vec_thread_block_desc,
p_in_vec_block +
in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data,
hi_thread_data_begin,
wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc,
p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
out_nkhw_thread_desc,
p_out_thread);
#endif
}
}
// copy output tensor from register to global mem
threadwise_4d_tensor_copy(
out_nkhw_thread_desc,
p_out_thread,
out_nkhw_global_desc,
p_out_global +
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin),
out_nkhw_thread_desc.GetLengths());
}
#pragma once
#include "common.hip.hpp"
#include "ConstantTensorDescriptor.hip.hpp"
#include "ConstantMatrixDescriptor.hip.hpp"
#include "blockwise_4d_tensor_op.hip.hpp"
#include "blockwise_2d_tensor_op.hip.hpp"
#include "threadwise_2d_tensor_op.hip.hpp"
#include "blockwise_gemm.hip.hpp"
// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned BPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned BPerThread,
unsigned KPerThread,
unsigned GemmThreadPerColumnPerCluster,
unsigned GemmThreadPerRowPerCluster,
unsigned GemmMPerThreadSubC,
unsigned GemmNPerThreadSubC,
unsigned GemmMLevel0Cluster,
unsigned GemmNLevel0Cluster,
unsigned GemmMLevel1Cluster,
unsigned GemmNLevel1Cluster,
unsigned GemmKPerThreadLoop,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1,
unsigned InBlockCopyDataPerRead,
unsigned WeiBlockCopyDataPerRead>
__global__ void
gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global,
const Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1);
constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2);
constexpr unsigned N = in_chwn_global_desc.GetLength(I3);
constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1);
constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2);
constexpr unsigned B = N * Hi * Wi;
constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1);
// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * Y * X, K>{});
// tensor view of blockwise input and weight
// be careful of alignment
constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, BPerBlock + BGhostRead>{}, Number<InBlockCopyDataPerRead>{});
constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock * Y * X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned(
Sequence<CPerBlock, Y, X, KPerBlock>{}, Number<WeiBlockCopyDataPerRead>{});
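    // Note (interpretation, not from the original source): wei_ek_block_desc and
    // wei_cyxk_block_desc are 2d and 4d views of the same LDS buffer; both are presumably
    // aligned on WeiBlockCopyDataPerRead along the fastest (K) dimension, so the flattened
    // [C*Y*X, K] blockwise copy and the per-(y, x) [C, Y, X, K] reads in the GEMM loop
    // agree on the layout.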
// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// format is [CPerBlock, BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyDataPerRead>{};
#endif
// blockwise wei copy
// format is [CPerBlock*Y*X,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 0
const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#elif 1
const auto blockwise_wei_copy = Blockwise2dTensorCopy3<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyDataPerRead>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
// a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_cyxk_block_desc.GetStride(I0)>{});
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm =
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop>{};
// LDS: be careful of alignment
constexpr unsigned in_block_size =
in_cb_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size =
wei_cyxk_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead
? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead;
// LDS
__shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
const Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
// register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0),
__syncthreads())
{
// load data
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block);
__syncthreads();
// compute on current data
// a series of GEMM
for(unsigned y = 0; y < Y; ++y)
{
for(unsigned x = 0; x < X; ++x)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 1
blockwise_gemm.Run
#else
blockwise_gemm.Run_RegisterDoubleBuffer
#endif
(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
p_in_block + y * Wi + x,
p_out_thread,
f_accum);
}
}
}
    // output: copy from register to global mem
const auto c_thread_mtx_begin =
blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
const auto c_thread_mtx_distance =
blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b);
unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row;
unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col;
unsigned h_data = b_data / (Wi * N);
unsigned itmp = b_data - h_data * (Wi * N);
unsigned w_data = itmp / N;
unsigned n_data = itmp - w_data * N;
if(n_data < N && h_data < Ho && w_data < Wo)
{
p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] =
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
}
}
}
}
......@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
__syncthreads();
// load next data
#if 1
#if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
#elif 1
......@@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
}
}
#if 0
#if 1
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
#endif