yangql / composable_kernel-1 · Commits · a9031464
"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "e48a403eb6310c9e1af6a0f2d165e5589879ab1e"
Commit a9031464, authored Apr 25, 2019 by Chao Liu

    implicit gemm v1r3 nchw_cyxk_nkhw

Parent: 569ad66e

Changes: 18 — showing 18 changed files with 1202 additions and 523 deletions (+1202 −523)
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp                                  +2    −2
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp                                  +224  −0
driver/driver.hip.cpp                                                                           +4    −2
src/include/Array.hip.hpp                                                                       +1    −1
src/include/ConstantTensorDescriptor.hip.hpp                                                    +7    −2
src/include/Sequence.hip.hpp                                                                    +2    −1
src/include/blockwise_2d_tensor_op.hip.hpp                                                      +0    −1
src/include/blockwise_4d_tensor_op.hip.hpp                                                      +0    −336
src/include/blockwise_batched_gemm.hip.hpp                                                      +0    −3
src/include/blockwise_nd_tensor_op.hip.hpp                                                      +252  −0
src/include/conv_common.hip.hpp                                                                 +0    −1
src/include/data_type.hip.hpp                                                                   +31   −8
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp                      +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp    +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp                      +12   −12
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp                      +514  −0
src/include/threadwise_4d_tensor_op.hip.hpp                                                     +0    −129
src/include/threadwise_nd_tensor_op.hip.hpp                                                     +129  −1
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp

@@ -128,7 +128,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
     constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+    constexpr index_t InBlockReorderDataPerWrite_N = 1;

     using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;

@@ -163,7 +163,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
     using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
     constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
-    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet
+    constexpr index_t InBlockReorderDataPerWrite_N = 2;

     using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp (new file, mode 100644)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hip.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
                                                        const Tensor<T>& in_nchw,
                                                        WeiDesc,
                                                        const Tensor<T>& wei_kcyx,
                                                        OutDesc,
                                                        Tensor<T>& out_nkhw,
                                                        index_t nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
    };

    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
        std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 0
    // for 3x3, 28x28, v1r2, Pascal
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 16;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 2;

    constexpr index_t NPerThread  = 4;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 2>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 2;
    constexpr index_t InBlockReorderDataPerWrite_N = 4;

    using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
    // for 3x3, 28x28, v1r3, Pascal, bad
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 16;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 2;

    constexpr index_t NPerThread  = 4;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 1>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet

    using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 1
    // for 3x3, 34x34, v1r3, Pascal
    constexpr index_t BlockSize = 128;

    constexpr index_t NPerBlock  = 2;
    constexpr index_t KPerBlock  = 128;
    constexpr index_t CPerBlock  = 8;
    constexpr index_t HoPerBlock = 2;
    constexpr index_t WoPerBlock = 16;

    constexpr index_t NPerThread  = 2;
    constexpr index_t KPerThread  = 8;
    constexpr index_t HoPerThread = 1;
    constexpr index_t WoPerThread = 4;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 2;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockReorderSrcSubLengths_NCHW     = Sequence<2, 1, 2, 1>;
    using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
    constexpr index_t InBlockReorderDataPerWrite_N = 1; // not used yet

    using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used
    constexpr index_t WeiBlockCopyDataPerRead_K = 4;

    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#endif

    constexpr index_t GridSize =
        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw<
            GridSize,
            BlockSize,
            T,
            decltype(in_nchw_desc),
            decltype(wei_cyxk_desc),
            decltype(out_nkhw_desc),
            NPerBlock,
            KPerBlock,
            CPerBlock,
            HoPerBlock,
            WoPerBlock,
            NPerThread,
            KPerThread,
            HoPerThread,
            WoPerThread,
            GemmMPerThreadSubC,
            GemmNPerThreadSubC,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmKPerThreadLoop,
            GemmDataPerReadA,
            GemmDataPerReadB,
            InBlockReorderSrcSubLengths_NCHW,
            InBlockReorderSrcClusterLengths_NCHW,
            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
            InBlockReorderDataPerRead_W,
            InBlockReorderDataPerWrite_N,
            WeiBlockCopyClusterLengths,
            WeiBlockCopyDataPerRead_K,
            OutThreadCopyDataPerWrite_W>{};

        float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
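Aside on the GridSize expression in the driver above: it is the product of per-dimension ceiling divisions of the output tensor extents by the block tile. A small standalone host-side sketch of the same arithmetic; the problem sizes N = 64, K = 256, Ho = Wo = 32 are assumptions for illustration (a 3x3 filter over the 34x34 input that the "#elif 1" branch targets), while the tile sizes are taken from that branch.

// Sketch of the ceiling-division grid sizing used by the driver above.
// Problem extents here are assumed values, not taken from the repository.
#include <cstdio>

using index_t = unsigned int;

constexpr index_t ceil_div(index_t a, index_t b) { return (a + b - 1) / b; }

int main()
{
    constexpr index_t N = 64, K = 256, Ho = 32, Wo = 32;                               // assumed problem size
    constexpr index_t NPerBlock = 2, KPerBlock = 128, HoPerBlock = 2, WoPerBlock = 16; // "#elif 1" tile

    constexpr index_t GridSize = ceil_div(N, NPerBlock) * ceil_div(K, KPerBlock) *
                                 ceil_div(Ho, HoPerBlock) * ceil_div(Wo, WoPerBlock);

    std::printf("GridSize = %u\n", GridSize); // 32 * 2 * 16 * 2 = 2048 workgroups
    return 0;
}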
driver/driver.hip.cpp

@@ -12,7 +12,7 @@
 //#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"

 struct GeneratorTensor_1

@@ -605,8 +605,10 @@ int main(int argc, char* argv[])
     device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
+#elif 1
+    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
 #endif
src/include/Array.hip.hpp

@@ -24,7 +24,7 @@ struct Array
     {
         Array<TData, NSize + 1> new_array;

-        static_for<0, NSize, 1>{}([=](auto I) {
+        static_for<0, NSize, 1>{}([&](auto I) {
            constexpr index_t i = I.Get();

            new_array[i] = mData[i];
        });
src/include/ConstantTensorDescriptor.hip.hpp

@@ -137,11 +137,16 @@ struct ConstantTensorDescriptor
     }

     template <index_t... Is>
-    __host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> multi_id)
+    __host__ __device__ static constexpr index_t Get1dIndex(Sequence<Is...> /* multi_id */)
     {
         static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent");

-        return Get1dIndex(Is...);
+        constexpr auto multi_id = Sequence<Is...>{};
+
+        constexpr auto seq_tmp =
+            transform_sequences(mod_conv::multiplies<index_t>{}, multi_id, GetStrides());
+
+        return accumulate_on_sequence(seq_tmp, mod_conv::plus<index_t>{}, Number<0>{});
     }

     __host__ __device__ static Array<index_t, nDim> GetMultiIndex(index_t id)
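The rewritten Get1dIndex above computes the flat offset as a compile-time dot product of the multi-index with the descriptor strides, instead of forwarding to the variadic overload. A minimal host-side sketch of the same rule, without the Sequence/Number machinery; the packed NCHW lengths used below are an assumption for illustration.

// Sketch: offset = sum_i multi_id[i] * stride[i], the rule behind Get1dIndex.
#include <array>
#include <cstdio>

using index_t = unsigned int;

index_t get_1d_index(const std::array<index_t, 4>& multi_id, const std::array<index_t, 4>& strides)
{
    index_t offset = 0;
    for(index_t i = 0; i < 4; ++i)
        offset += multi_id[i] * strides[i];
    return offset;
}

int main()
{
    // assumed packed NCHW tensor with lengths {2, 8, 4, 16} -> strides {512, 64, 16, 1}
    const std::array<index_t, 4> strides{512, 64, 16, 1};
    const std::array<index_t, 4> multi_id{1, 3, 2, 5};
    std::printf("offset = %u\n", get_1d_index(multi_id, strides)); // 512 + 192 + 32 + 5 = 741
    return 0;
}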
src/include/Sequence.hip.hpp

@@ -246,7 +246,8 @@ struct accumulate_on_sequence_f
 };

 template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number<I>)
+__host__ __device__ constexpr index_t
+accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
 {
     constexpr index_t a =
         static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_f<Seq>{}, Reduce{});
src/include/blockwise_2d_tensor_op.hip.hpp

@@ -471,7 +471,6 @@ struct Blockwise2dTensorCopy3
                           DstDesc{}.GetStride(I0) % DataPerRead == 0,
                       "src and dst stride should be multiple of DataPerRead to keep alignment");

        constexpr index_t L0 = CopyLengths{}.Get(I0);
        constexpr index_t L1 = CopyLengths{}.Get(I1);

        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
src/include/blockwise_4d_tensor_op.hip.hpp

@@ -761,339 +761,3 @@ struct Blockwise4dTensorCopyReorder1
            SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
    }
};

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcLengths,
          class SrcSubLengths,
          class SrcClusterLengths,
          class MapDst2Src,
          class MapThreadCluster2SrcCluster,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite>
struct Blockwise4dTensorCopyReorder3
{
    static constexpr index_t nDim = SrcLengths::GetSize();

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ Blockwise4dTensorCopyReorder3()
    {
        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr auto src_lengths = SrcLengths{};

        constexpr auto map_dst2src = MapDst2Src{};

        constexpr auto src_sub_lengths = SrcSubLengths{};
        constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);

        constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};

        constexpr auto src_cluster_lengths = SrcClusterLengths{};
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

        // sanity check: nDim
        static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
                          SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                          MapThreadCluster2SrcCluster::GetSize() == nDim,
                      "wrong! nDim is not consistent\n");

        // sanity check: BlockSize
        constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();

        static_assert(BlockSize >= num_active_thread,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        // sanity check: work division
        static_for<0, nDim, 1>{}([](auto IDim) {
            constexpr auto I = decltype(IDim){};

            constexpr index_t src_len         = src_lengths.Get(I);
            constexpr index_t src_sub_len     = src_sub_lengths.Get(I);
            constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);

            static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
                          "wrong! cannot evenly divide Src tensor lengths");
        });

        // sanity check: src read
        static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
                      "wrong! only support SrcDataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");

        static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
                      "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");

        static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
                      "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
                      "keep alignment");

        // sanity check: dst write
        static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                      "wrong! only support DstDataPerWrite == 1, 2 or 4!\n");

        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");

        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");

        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
                      "keep alignment");

        // start dividing work
        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // regsiters, or only one copy???
        auto src_data_multi_id =
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I    = decltype(IDim){};
            constexpr index_t i = I.Get();

            // compiler: will it really compute index here, or be associated with Get1dIndex and
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

        // compiler: will it really compute index here, or be associated with Get1dIndex and
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);

#if 0
        if(get_block_1d_id() == 0)
        {
            printf("tid %5u, "
                   "thread_multi_id %5u %5u %5u %5u, "
                   "src_data_multi_id %5u %5u %5u %5u, "
                   "dst_data_multi_id %5u %5u %5u %5u, "
                   "mSrcMyThreadOffset %u, mDstMyThreadOffset %u\n",
                   get_thread_local_1d_id(),
                   thread_multi_id[0],
                   thread_multi_id[1],
                   thread_multi_id[2],
                   thread_multi_id[3],
                   src_data_multi_id[0],
                   src_data_multi_id[1],
                   src_data_multi_id[2],
                   src_data_multi_id[3],
                   dst_data_multi_id[0],
                   dst_data_multi_id[1],
                   dst_data_multi_id[2],
                   dst_data_multi_id[3],
                   mSrcMyThreadOffset,
                   mDstMyThreadOffset);
        }
#endif
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        constexpr auto thread_sub_tensor_desc =
            make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());

#if 1
        for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
        {
            for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
            {
                for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
                {
                    for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
                        ++icluster_d3)
                    {
                        const index_t src_offset = SrcDesc{}.Get1dIndex(
                            icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
                            icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
                            icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
                            icluster_d3 * src_data_per_cluster_per_dims.Get(I3));

                        const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
                            icluster_d0 * thread_sub_tensor_lengths.Get(I0),
                            icluster_d1 * thread_sub_tensor_lengths.Get(I1),
                            icluster_d2 * thread_sub_tensor_lengths.Get(I2),
                            icluster_d3 * thread_sub_tensor_lengths.Get(I3));

                        threadwise_nd_tensor_copy(SrcDesc{},
                                                  p_src + src_offset + mSrcMyThreadOffset,
                                                  thread_tensor_desc,
                                                  p_clipboard + clipboard_offset,
                                                  thread_sub_tensor_lengths,
                                                  Number<SrcDataPerRead>{});
                    }
                }
            }
        }
#else
        static_ford<decltype(cluster_per_dims)>{}([=](auto cluster_ids) {

        });
#endif

#if 0
        if(get_block_1d_id() == 0)
        {
            printf("tid %5u, "
                   "data: %f %f %f %f %f %f %f %f\n",
                   get_thread_local_1d_id(),
                   p_clipboard[0],
                   p_clipboard[1],
                   p_clipboard[2],
                   p_clipboard[3],
                   p_clipboard[4],
                   p_clipboard[5],
                   p_clipboard[6],
                   p_clipboard[7]);
        }
#endif
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto cluster_per_dims = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, cluster_per_dims);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        constexpr auto thread_sub_tensor_desc =
            make_ConstantTensorDescriptor(SrcClusterLengths{}, thread_tensor_desc.GetStrides());

        for(index_t icluster_d0 = 0; icluster_d0 < cluster_per_dims.Get(I0); ++icluster_d0)
        {
            for(index_t icluster_d1 = 0; icluster_d1 < cluster_per_dims.Get(I1); ++icluster_d1)
            {
                for(index_t icluster_d2 = 0; icluster_d2 < cluster_per_dims.Get(I2); ++icluster_d2)
                {
                    for(index_t icluster_d3 = 0; icluster_d3 < cluster_per_dims.Get(I3);
                        ++icluster_d3)
                    {
                        const index_t clipboard_offset = thread_tensor_desc.Get1dIndex(
                            icluster_d0 * thread_sub_tensor_lengths.Get(I0),
                            icluster_d1 * thread_sub_tensor_lengths.Get(I1),
                            icluster_d2 * thread_sub_tensor_lengths.Get(I2),
                            icluster_d3 * thread_sub_tensor_lengths.Get(I3));

                        const auto dst_multi_id = reorder_array_given_new2old(
                            Array<index_t, nDim>{
                                icluster_d0 * src_data_per_cluster_per_dims.Get(I0),
                                icluster_d1 * src_data_per_cluster_per_dims.Get(I1),
                                icluster_d2 * src_data_per_cluster_per_dims.Get(I2),
                                icluster_d3 * src_data_per_cluster_per_dims.Get(I3)},
                            MapDst2Src{});

                        const index_t dst_offset = DstDesc{}.Get1dIndex(dst_multi_id);

#if 0
                        if(get_block_1d_id() == 0)
                        {
                            printf("tid %5u, "
                                   "clipboard_offsetm %5u, dst_offset %5u\n",
                                   get_thread_local_1d_id(),
                                   clipboard_offset,
                                   dst_offset);
                        }
#endif

                        threadwise_4d_tensor_copy_reorder_given_dst2src_v2(
                            thread_tensor_desc,
                            p_clipboard + clipboard_offset,
                            DstDesc{},
                            p_dst + dst_offset + mDstMyThreadOffset,
                            thread_sub_tensor_lengths,
                            MapDst2Src{});
                    }
                }
            }
        }
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
src/include/blockwise_batched_gemm.hip.hpp

@@ -53,7 +53,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr index_t M = a_block_mtx.NCol(); // A is transposed
        constexpr index_t N = b_block_mtx.NCol();
        constexpr index_t K = a_block_mtx.NRow();

        constexpr index_t MPerThread = c_thread_mtx.NRow();
        constexpr index_t NPerThread = c_thread_mtx.NCol();

@@ -114,8 +113,6 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
    __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const
    {
        constexpr index_t BatchThreadWork = BatchSize / BatchPerThread;

        constexpr index_t ThreadPerLevel1Cluster =
            MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster;
src/include/blockwise_nd_tensor_op.hip.hpp (new file, mode 100644)

#pragma once
#include "threadwise_nd_tensor_op.hip.hpp"

template <index_t BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcLengths,
          class SrcSubLengths,
          class SrcClusterLengths,
          class MapDst2Src,
          class MapThreadCluster2SrcCluster,
          index_t SrcDataPerRead,
          index_t DstDataPerWrite>
struct BlockwiseNdTensorCopyReorder_v3
{
    static constexpr index_t nDim = SrcLengths::GetSize();

    index_t mSrcMyThreadOffset;
    index_t mDstMyThreadOffset;

    __device__ BlockwiseNdTensorCopyReorder_v3()
    {
        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};

        constexpr auto src_lengths = SrcLengths{};

        constexpr auto map_dst2src = MapDst2Src{};

        constexpr auto src_sub_lengths = SrcSubLengths{};
        constexpr auto dst_sub_lengths = src_sub_lengths.ReorderGivenNew2Old(map_dst2src);

        constexpr auto map_thread_cluster_2_src_cluster = MapThreadCluster2SrcCluster{};

        constexpr auto src_cluster_lengths = SrcClusterLengths{};
        constexpr auto thread_cluster_lengths =
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");

        // sanity check: nDim
        static_assert(SrcDesc::GetDimension() == nDim && DstDesc::GetDimension() == nDim &&
                          SrcLengths::GetSize() == nDim && SrcSubLengths::GetSize() == nDim &&
                          SrcClusterLengths::GetSize() == nDim && MapDst2Src::GetSize() == nDim &&
                          MapThreadCluster2SrcCluster::GetSize() == nDim,
                      "wrong! nDim is not consistent\n");

        // sanity check: BlockSize
        constexpr index_t num_active_thread = thread_cluster_desc.GetElementSize();

        static_assert(BlockSize >= num_active_thread,
                      "wrong! BlockSize is not big enough for ThreadPerDims!");

        // sanity check: work division
        static_for<0, nDim, 1>{}([](auto IDim) {
            constexpr auto I = decltype(IDim){};

            constexpr index_t src_len         = src_lengths.Get(I);
            constexpr index_t src_sub_len     = src_sub_lengths.Get(I);
            constexpr index_t src_cluster_len = src_cluster_lengths.Get(I);

            static_assert(src_len % (src_sub_len * src_cluster_len) == 0,
                          "wrong! cannot evenly divide Src tensor lengths");
        });

        // sanity check: src read
        static_assert(SrcDataPerRead == 1 || SrcDataPerRead == 2 || SrcDataPerRead == 4,
                      "wrong! only support SrcDataPerRead == 1, 2 or 4!\n");

        static_assert(SrcDataPerRead == 1 || src_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support src.stride(nDim-1) == 1 if SrcDataPerRead > 1!\n");

        static_assert(src_sub_lengths.Get(Number<nDim - 1>{}) % SrcDataPerRead == 0,
                      "wrong! src_sub_lengths[nDim-1] % SrcDataPerRead != 0\n");

        static_assert(src_desc.GetStride(Number<nDim - 2>{}) % SrcDataPerRead == 0,
                      "wrong! should satisfy src_desc.stride(nDim-2) % SrcDataPerRead == 0, to "
                      "keep alignment");

        // sanity check: dst write
        static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                      "wrong! only support DstDataPerWrite == 1, 2 or 4!\n");

        static_assert(DstDataPerWrite == 1 || dst_desc.GetStride(Number<nDim - 1>{}) == 1,
                      "wrong! only support dst.stride(nDim-1) == 1 if DstDataPerWrite > 1!\n");

        static_assert(dst_sub_lengths.Get(Number<nDim - 1>{}) % DstDataPerWrite == 0,
                      "wrong! dst_sub_lengths[nDim-1] % DstDataPerWrite != 0\n");

        static_assert(dst_desc.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                      "wrong! should satisfy dst_desc.stride(nDim-2) % DstDataPerWrite == 0, to "
                      "keep alignment");

        // start dividing work
        if(BlockSize > num_active_thread)
        {
            if(get_thread_local_1d_id() >= num_active_thread)
            {
                return;
            }
        }

        const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id());

        // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate
        // regsiters, or only one copy???
        auto src_data_multi_id =
            reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);

        static_for<0, nDim, 1>{}([&](auto IDim) {
            constexpr auto I    = decltype(IDim){};
            constexpr index_t i = I.Get();

            // compiler: will it really compute index here, or be associated with Get1dIndex and
            // optimized away???
            src_data_multi_id[i] *= src_sub_lengths.Get(I);
        });

        // compiler: will it really compute index here, or be associated with Get1dIndex and
        // optimized away???
        const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src);

        mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id);
        mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id);
    }

    __device__ static constexpr index_t GetRegisterClipboardSize()
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }

    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                             Float* __restrict__ p_clipboard) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto src_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);

            constexpr auto clipboard_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);

            constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id);

            constexpr index_t clipboard_offset =
                thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);

            threadwise_nd_tensor_copy(SrcDesc{},
                                      p_src + src_offset + mSrcMyThreadOffset,
                                      thread_tensor_desc,
                                      p_clipboard + clipboard_offset,
                                      thread_sub_tensor_lengths,
                                      Number<SrcDataPerRead>{});
        });
    }

    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                              Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SrcSubLengths{};

        constexpr auto src_data_per_cluster_per_dims = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, SrcClusterLengths{});

        constexpr auto repeat_lengths = transform_sequences(
            mod_conv::integer_divide_ceiler<index_t>{}, SrcLengths{}, src_data_per_cluster_per_dims);

        constexpr auto thread_tensor_lengths = transform_sequences(
            mod_conv::multiplies<index_t>{}, thread_sub_tensor_lengths, repeat_lengths);

        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

            constexpr auto clipboard_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, thread_sub_tensor_lengths);

            constexpr auto src_data_multi_id = transform_sequences(
                mod_conv::multiplies<index_t>{}, repeat_multi_id, src_data_per_cluster_per_dims);

            // reorder src_data_multi_id to get dst_data_multi_id
            constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{});

            constexpr index_t clipboard_offset =
                thread_tensor_desc.Get1dIndex(clipboard_data_multi_id);

            constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id);

            // write in the order of dst
#if 1
            threadwise_nd_tensor_copy_reorder_given_dst2src_v2(thread_tensor_desc,
                                                               p_clipboard + clipboard_offset,
                                                               DstDesc{},
                                                               p_dst + dst_offset + mDstMyThreadOffset,
                                                               thread_sub_tensor_lengths,
                                                               MapDst2Src{});
#else
            threadwise_nd_tensor_copy_reorder_given_dst2src_v3(thread_tensor_desc,
                                                               p_clipboard + clipboard_offset,
                                                               DstDesc{},
                                                               p_dst + dst_offset + mDstMyThreadOffset,
                                                               thread_sub_tensor_lengths,
                                                               MapDst2Src{},
                                                               Number<DstDataPerWrite>{});
#endif
        });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
        Float p_clipboard[GetRegisterClipboardSize()];

        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
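The register clipboard in the new BlockwiseNdTensorCopyReorder_v3 is sized per thread from SrcSubLengths and the per-dimension repeat counts: repeat_lengths[i] = ceil(SrcLengths[i] / (SrcSubLengths[i] * SrcClusterLengths[i])) and thread_tensor_lengths[i] = SrcSubLengths[i] * repeat_lengths[i]. A small host-side sketch of that sizing rule (ignoring any padding GetElementSpace() might add); the lengths below mirror the #elif 1 configuration of the new driver above, so they are illustrative rather than authoritative.

// Sketch of GetRegisterClipboardSize(): how many elements a thread keeps in registers.
#include <cstdio>

using index_t = unsigned int;

constexpr index_t ceil_div(index_t a, index_t b) { return (a + b - 1) / b; }

int main()
{
    constexpr index_t nDim = 4;
    constexpr index_t src_lengths[nDim]     = {2, 8, 2, 16}; // NPerBlock, CPerBlock, HoPerBlock, WoPerBlock
    constexpr index_t sub_lengths[nDim]     = {2, 1, 2, 1};  // per-thread sub-tensor
    constexpr index_t cluster_lengths[nDim] = {1, 8, 1, 16}; // thread cluster

    index_t clipboard = 1;
    for(index_t i = 0; i < nDim; ++i)
    {
        const index_t repeats = ceil_div(src_lengths[i], sub_lengths[i] * cluster_lengths[i]);
        clipboard *= sub_lengths[i] * repeats;
    }
    std::printf("register clipboard elements per thread = %u\n", clipboard); // 2 * 1 * 2 * 1 = 4
    return 0;
}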
src/include/conv_common.hip.hpp

@@ -73,7 +73,6 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
 template <class InDesc, class WeiDesc, class OutDesc>
 __host__ __device__ constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
 {
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
src/include/data_type.hip.hpp

 #pragma once
 #include "config.h"
+#include "constant_integral.hip.hpp"

 template <class T, index_t N>
 struct vector_type

@@ -10,6 +11,13 @@ template <>
 struct vector_type<float, 1>
 {
     typedef float MemoryType;
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 1, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
 };

 template <>

@@ -20,21 +28,29 @@ struct vector_type<float, 2>
     // instruction
     typedef float MemoryType __attribute__((ext_vector_type(2)));
 #elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition to, otherwise
+    // For some reason, CUDA need this definition, otherwise
     // compiler won't generate optimal load and store instruction, and
     // kernel would produce wrong result, indicating the compiler fail to generate correct
     // instruction,
     using MemoryType = float2;
 #endif

-    __host__ __device__ static MemoryType Pack(float s0, float s1)
-    {
-        union
-        {
-            MemoryType vector;
-            float scalar[2];
-        } data;
+    union Data
+    {
+        MemoryType vector;
+        float scalar[2];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 2, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
+
+    __host__ __device__ static MemoryType Pack(float s0, float s1)
+    {
+        Data data;

         data.scalar[0] = s0;
         data.scalar[1] = s1;
         return data.vector;

@@ -49,12 +65,19 @@ struct vector_type<float, 4>
     // instruction
     typedef float MemoryType __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition to, otherwise
+    // For some reason, CUDA need this definition, otherwise
     // compiler won't generate optimal load and store instruction, and
     // kernel would produce wrong result, indicating the compiler fail to generate correct
     // instruction,
     using MemoryType = float4;
 #endif

+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    {
+        static_assert(I < 4, "wrong");
+        *(reinterpret_cast<float*>(&v) + I) = s;
+    }
 };

 #if 0
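The rewritten Pack above now round-trips through the new Data union instead of relying only on pointer casts, and SetScalar fills one lane of the vector register at a time. A small host-only sketch of the same union-based packing idea (standalone; float2_t here is an assumed stand-in struct, not the project's vector MemoryType):

// Sketch of union-based packing, in the spirit of vector_type<float, 2>::Pack.
#include <cstdio>

struct float2_t { float x, y; }; // stand-in for the 2-wide vector register type

static float2_t pack(float s0, float s1)
{
    union Data {
        float2_t vector;
        float scalar[2];
    } data;
    data.scalar[0] = s0; // fill the scalar view
    data.scalar[1] = s1;
    return data.vector;  // read back the vector view
}

int main()
{
    const float2_t v = pack(1.0f, 2.0f);
    std::printf("%f %f\n", v.x, v.y);
    return 0;
}

Strictly speaking, reading a union member other than the one last written is type punning that ISO C++ does not bless, but it is the idiom the kernel code relies on and the compilers targeted here handle it as expected.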
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp

@@ -4,7 +4,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
 #include "blockwise_3d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -125,17 +125,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WiPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, X * KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp

@@ -3,7 +3,7 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -133,17 +133,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp

@@ -3,7 +3,7 @@
 #include "ConstantTensorDescriptor.hip.hpp"
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_2d_tensor_op.hip.hpp"
-#include "blockwise_4d_tensor_op.hip.hpp"
+#include "blockwise_nd_tensor_op.hip.hpp"
 #include "threadwise_nd_tensor_op.hip.hpp"
 #include "threadwise_4d_tensor_op.hip.hpp"
 #include "blockwise_batched_gemm.hip.hpp"

@@ -130,17 +130,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
         constexpr auto map_chwn2nchw = Sequence<1, 2, 3, 0>{};

         const auto blockwise_in_copy_reorder =
-            Blockwise4dTensorCopyReorder3<BlockSize,
-                                          Float,
-                                          decltype(in_n_c_h_w_global_desc),
-                                          decltype(in_c_h_w_n_block_desc),
-                                          Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
-                                          InBlockReorderSrcSubLengths_NCHW,
-                                          InBlockReorderSrcClusterLengths_NCHW,
-                                          decltype(map_chwn2nchw),
-                                          InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-                                          InBlockReorderDataPerRead_W,
-                                          InBlockReorderDataPerWrite_N>{};
+            BlockwiseNdTensorCopyReorder_v3<BlockSize,
+                                            Float,
+                                            decltype(in_n_c_h_w_global_desc),
+                                            decltype(in_c_h_w_n_block_desc),
+                                            Sequence<NPerBlock, CPerBlock, HoPerBlock, WoPerBlock>,
+                                            InBlockReorderSrcSubLengths_NCHW,
+                                            InBlockReorderSrcClusterLengths_NCHW,
+                                            decltype(map_chwn2nchw),
+                                            InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+                                            InBlockReorderDataPerRead_W,
+                                            InBlockReorderDataPerWrite_N>{};

         // blockwise wei copy
         //   format is [CPerBlock, KPerBlock]
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp (new file, mode 100644)

(This diff is collapsed in the original view.)
src/include/threadwise_4d_tensor_op.hip.hpp

@@ -139,135 +139,6 @@ __device__ void threadwise_4d_tensor_copy_reorder_given_dst2src(SrcDesc,
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, MapDst2Src{}, f_copy);
}

#if 0 // replaced threadwise_nd_tensor_copy
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_4d_tensor_copy(
    SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
{
    auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};

    threadwise_4d_tensor_copy_reorder_given_dst2src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}

// need to assume src and dst is aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_4d_tensor_copy_v2(SrcDesc,
                                             const Float* __restrict__ p_src,
                                             DstDesc,
                                             Float* __restrict__ p_dst,
                                             SrcOpLengths,
                                             Number<DataPerRead>)
{
    static_assert(SrcDesc{}.GetDimension() == 4 && DstDesc{}.GetDimension() == 4 &&
                      SrcOpLengths::GetSize() == 4,
                  "wrong! should be 4 dimension");

    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1,
                  "wrong! only support stride3 == 1!\n");

    static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                  "wrong! only support DataPerRead == 1, 2 or 4!\n");

    static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 &&
                      DstDesc{}.GetStride(I2) % DataPerRead == 0,
                  "wrong! src and dst stride should be multiple of DataPerRead to keep alignment");

    constexpr index_t L3 = SrcOpLengths{}.Get(I3);

    static_assert(L3 % DataPerRead == 0, "wrong! L3 should be evenly divided by DataPerRead");

    constexpr index_t nloop_d3 = L3 / DataPerRead;

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
            {
                for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3)
                {
                    const index_t src_index =
                        src_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);

                    const index_t dst_index =
                        dst_desc.Get1dIndex(did0, did1, did2, iloop_d3 * DataPerRead);

                    *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
                        *(reinterpret_cast<const vector_t*>(&p_src[src_index]));
                }
            }
        }
    }
}
#endif

template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_4d_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr index_t IR0 = MapDst2Src{}.Get(I0);
    constexpr index_t IR1 = MapDst2Src{}.Get(I1);
    constexpr index_t IR2 = MapDst2Src{}.Get(I2);
    constexpr index_t IR3 = MapDst2Src{}.Get(I3);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    // ref_desc has dst_desc's ordering
    constexpr auto ref_desc =
        make_ConstantTensorDescriptor(SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{}));

    for(index_t did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            for(index_t did2 = 0; did2 < ref_desc.GetLength(I2); ++did2)
            {
                for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3)
                {
                    const auto dst_multi_id = Array<index_t, 4>{did0, did1, did2, did3};

                    const auto src_multi_id =
                        reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

                    const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
                    const index_t src_index = src_desc.Get1dIndex(src_multi_id);

                    p_dst[dst_index] = p_src[src_index];
                }
            }
        }
    }
}

template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
src/include/threadwise_nd_tensor_op.hip.hpp

@@ -50,7 +50,7 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
     constexpr index_t nRead = L_Back / DataPerRead;

     static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
-        static_for<0, nRead, 1>{}([=](auto IRead) {
+        static_for<0, nRead, 1>{}([&](auto IRead) {
             constexpr auto multi_id =
                 decltype(Ids){}.PushBack(Number<IRead.Get() * DataPerRead>{});

             const index_t src_index = src_desc.Get1dIndex(multi_id);

@@ -62,3 +62,131 @@ __device__ void threadwise_nd_tensor_copy(SrcDesc,
        });
    });
}

// write in order of src
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v1(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    ford<SrcOpLengths>{}([&](auto src_multi_id) {
        const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});

        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
        const index_t src_index = src_desc.Get1dIndex(src_multi_id);

        p_dst[dst_index] = p_src[src_index];
    });
}

// write in order of dst
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths, class MapDst2Src>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v2(SrcDesc,
                                                                   const SrcData* __restrict__ p_src,
                                                                   DstDesc,
                                                                   DstData* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src)
{
    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});

    ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
        const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

        const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);
        const index_t src_index = src_desc.Get1dIndex(src_multi_id);

        p_dst[dst_index] = p_src[src_index];
    });
}

// write in order of dst
template <class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class MapDst2Src,
          index_t DstDataPerWrite>
__device__ void threadwise_nd_tensor_copy_reorder_given_dst2src_v3(SrcDesc,
                                                                   const Float* __restrict__ p_src,
                                                                   DstDesc,
                                                                   Float* __restrict__ p_dst,
                                                                   SrcOpLengths,
                                                                   MapDst2Src,
                                                                   Number<DstDataPerWrite>)
{
    using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;

    constexpr index_t nDim = SrcOpLengths::GetSize();

    static_assert(DstDataPerWrite == 1 || DstDesc{}.GetStride(Number<nDim - 1>{}) == 1,
                  "wrong! only support dst.stride[nDim-1] == 1, if DstDataPerWrite != 1");

    static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
                  "wrong! only support DstDataPerWrite == 1, 2 or 4");

    static_assert(DstDesc{}.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
                  "wrong! dst.stride[nDim-2] should be multiple of DstDataPerWrite to keep alignment");

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};

    constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});

    constexpr index_t L_Dst_Back = dst_op_lengths.Back();

    static_assert(L_Dst_Back % DstDataPerWrite == 0,
                  "wrong! dst.lengths[nDim-1] should be evenly divided by DstDataPerWrite");

    constexpr index_t nWrite = L_Dst_Back / DstDataPerWrite;

    ford<decltype(dst_op_lengths.PopBack())>{}([&](auto ids) {
        static_for<0, nWrite, 1>{}([&](auto IWrite) {
            vector_t dst_vec_data;

            // pack data
            static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
                const auto dst_multi_id =
                    ids.PushBack(IWrite.Get() * DstDataPerWrite + IDstData.Get());

                const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});

                const index_t src_index = src_desc.Get1dIndex(src_multi_id);

                vector_type<Float, DstDataPerWrite>::SetScalar(
                    dst_vec_data, p_src[src_index], IDstData);
            });

            // write data
            const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite);

            const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id);

            *(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
        });
    });
}
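The v1/v2/v3 copy-reorder variants above differ mainly in which side (src or dst) drives the loop nest and whether writes are vectorized; in each case one multi-index is derived from the other through MapDst2Src. A minimal host-side sketch of one plausible reading of that index remapping, using the Sequence<1, 2, 0, 3> map that appears throughout this commit; the helper name and its exact new2old semantics below are illustrative assumptions, not lifted from the project.

// Sketch: remap a multi-index through a dimension map, new_id[i] = old_id[map[i]].
#include <array>
#include <cstdio>

using index_t = unsigned int;
constexpr index_t nDim = 4;

std::array<index_t, nDim> remap_new_from_old(const std::array<index_t, nDim>& old_id,
                                             const std::array<index_t, nDim>& map_new2old)
{
    std::array<index_t, nDim> new_id{};
    for(index_t i = 0; i < nDim; ++i)
        new_id[i] = old_id[map_new2old[i]];
    return new_id;
}

int main()
{
    constexpr std::array<index_t, nDim> map_dst2src{1, 2, 0, 3}; // dst dim i reads src dim map[i]
    const std::array<index_t, nDim> src_id{7, 3, 5, 9};          // e.g. an (n, c, h, w) index
    const auto dst_id = remap_new_from_old(src_id, map_dst2src);
    std::printf("dst_multi_id = %u %u %u %u\n", dst_id[0], dst_id[1], dst_id[2], dst_id[3]); // 3 5 7 9
    return 0;
}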