Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
24d2f034
"git@developer.sourcefind.cn:yangql/composable_kernel-1.git" did not exist on "7353ec0c25468d754ad5dd786e979a3bbade0a47"
Commit
24d2f034
authored
Nov 25, 2018
by
Chao Liu
Browse files
refactor direct
parent
8732ea04
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
234 additions
and
1272 deletions
+234
-1272
driver/conv.cu
driver/conv.cu
+13
-14
driver/device_direct_convolution_1.cuh
driver/device_direct_convolution_1.cuh
+4
-4
driver/device_direct_convolution_2.cuh
driver/device_direct_convolution_2.cuh
+4
-13
driver/device_winograd_convolution.cuh
driver/device_winograd_convolution.cuh
+0
-89
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+32
-54
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+52
-400
src/include/blockwise_winograd_transform.cuh
src/include/blockwise_winograd_transform.cuh
+0
-36
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+28
-28
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+29
-63
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+32
-66
src/include/gridwise_winograd_convolution.cuh
src/include/gridwise_winograd_convolution.cuh
+0
-246
src/include/threadwise_direct_convolution.cuh
src/include/threadwise_direct_convolution.cuh
+4
-4
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+36
-117
src/include/threadwise_winograd_transform.cuh
src/include/threadwise_winograd_transform.cuh
+0
-138
No files found.
driver/conv.cu
View file @
24d2f034
...
@@ -7,7 +7,6 @@
...
@@ -7,7 +7,6 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_winograd_convolution.cuh"
struct
GeneratorConstant
struct
GeneratorConstant
{
{
...
@@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::
...
@@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::
{
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
...
@@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc)
...
@@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc)
{
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
std
::
initializer_list
<
unsigned
>
lengths
=
{
std
::
initializer_list
<
unsigned
>
lengths
=
{
...
@@ -396,7 +395,7 @@ int main()
...
@@ -396,7 +395,7 @@ int main()
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
(
make_TensorDescriptor
(
out_desc
));
#if
0
#if
1
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
...
@@ -405,17 +404,17 @@ int main()
...
@@ -405,17 +404,17 @@ int main()
for
(
int
i
=
0
;
i
<
20
;
++
i
)
for
(
int
i
=
0
;
i
<
20
;
++
i
)
{
{
#if 1
#if 1
device_direct_convolution_
2
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_direct_convolution_
1
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#else
#else
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#endif
#endif
}
}
#if
0
#if
1
host_
direct
_convolution(in, wei, out_host);
host_
winograd_3x3
_convolution
(
in
,
wei
,
out_host
);
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
#elif 0
#elif 0
host_
winograd_3x3
_convolution
(
in
,
wei
,
out_host
);
host_
direct
_convolution
(
in
,
wei
,
out_host
);
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
#endif
#endif
...
...
driver/device_direct_convolution_1.cuh
View file @
24d2f034
...
@@ -16,10 +16,10 @@ void device_direct_convolution_1(
...
@@ -16,10 +16,10 @@ void device_direct_convolution_1(
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
...
driver/device_direct_convolution_2.cuh
View file @
24d2f034
...
@@ -16,10 +16,10 @@ void device_direct_convolution_2(
...
@@ -16,10 +16,10 @@ void device_direct_convolution_2(
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
@@ -36,11 +36,6 @@ void device_direct_convolution_2(
...
@@ -36,11 +36,6 @@ void device_direct_convolution_2(
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
NBlockOpLen0
=
1
;
constexpr
unsigned
NBlockOpLen1
=
1
;
constexpr
unsigned
NBlockOpLen2
=
4
;
constexpr
unsigned
NBlockOpLen3
=
32
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
...
@@ -73,10 +68,6 @@ void device_direct_convolution_2(
...
@@ -73,10 +68,6 @@ void device_direct_convolution_2(
NPerThread
,
NPerThread
,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
BlockSize
,
BlockSize
,
GridSize
>
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
...
...
driver/device_winograd_convolution.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "gridwise_winograd_convolution.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_winograd_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
KPerBlock
=
16
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
YPerBlock
=
1
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NPerThread
=
2
;
constexpr
unsigned
KPerThread
=
2
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
(
out_desc
.
GetLength
(
I1
)
/
KPerBlock
)
*
(
out_desc
.
GetLength
(
I2
)
/
(
OutTileSizeH
*
YPerBlock
))
*
(
out_desc
.
GetLength
(
I3
)
/
(
OutTileSizeW
*
XPerBlock
));
dim3
block_dim
(
BlockSize
);
dim3
grid_dim
(
GridSize
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
cudaEvent_t
start
,
stop
;
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_winograd_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
KPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
NPerThread
,
KPerThread
,
CPerThread
,
BlockSize
,
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
cudaEventCreate
(
&
stop
);
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
\ No newline at end of file
src/include/blockwise_direct_convolution.cuh
View file @
24d2f034
...
@@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc,
...
@@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc,
OutBlockDesc
,
OutBlockDesc
,
TFloat
*
__restrict__
p_out_block
)
TFloat
*
__restrict__
p_out_block
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_block_desc
=
InBlockDesc
{};
constexpr
auto
in_block_desc
=
InBlockDesc
{};
constexpr
auto
wei_block_desc
=
WeiBlockDesc
{};
constexpr
auto
wei_block_desc
=
WeiBlockDesc
{};
...
@@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc,
...
@@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc,
TFloat
p_wei_thread
[
wei_thread_src_desc
.
GetElementSpace
()];
TFloat
p_wei_thread
[
wei_thread_src_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_src_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_src_desc
.
GetElementSpace
()];
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
in_thread_src_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
in_thread_src_desc
,
in_thread_src_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_dst_desc
,
in_thread_dst_desc
,
p_in_thread
,
p_in_thread
);
f_copy
);
for
(
unsigned
k_thread_work_begin
=
0
;
k_thread_work_begin
<
KPerBlock
;
for
(
unsigned
k_thread_work_begin
=
0
;
k_thread_work_begin
<
KPerBlock
;
++
k_thread_work_begin
)
++
k_thread_work_begin
)
{
{
// copy weight tensor into register
// copy weight tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
wei_thread_src_desc
,
decltype
(
wei_thread_src_desc
),
p_wei_block
+
decltype
(
wei_thread_dst_desc
),
wei_block_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
decltype
(
f_copy
)
>
(
wei_thread_dst_desc
,
wei_thread_src_desc
,
p_wei_thread
);
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_dst_desc
,
p_wei_thread
,
f_copy
);
// copy output tensor into register
// copy output tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
out_thread_src_desc
,
decltype
(
out_thread_src_desc
),
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
decltype
(
out_thread_dst_desc
),
k_thread_work_begin
,
decltype
(
f_copy
)
>
(
ho_thread_work_begin
,
out_thread_src_desc
,
wo_thread_work_begin
),
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
out_thread_dst_desc
,
k_thread_work_begin
,
p_out_thread
);
ho_thread_work_begin
,
wo_thread_work_begin
),
out_thread_dst_desc
,
p_out_thread
,
f_copy
);
// threadwise convolution
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
threadwise_direct_convolution
(
in_thread_dst_desc
,
decltype
(
in_thread_dst_desc
),
p_in_thread
,
decltype
(
wei_thread_dst_desc
),
wei_thread_dst_desc
,
decltype
(
out_thread_dst_desc
)
>
(
in_thread_dst_desc
,
p_wei_thread
,
p_in_thread
,
out_thread_dst_desc
,
wei_thread_dst_desc
,
p_out_thread
);
p_wei_thread
,
out_thread_dst_desc
,
p_out_thread
);
// accumulate output tensor into LDS
// accumulate output tensor into LDS
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
out_thread_dst_desc
,
decltype
(
out_thread_dst_desc
),
p_out_thread
,
decltype
(
out_thread_src_desc
),
out_thread_src_desc
,
decltype
(
f_copy
)
>
(
p_out_block
+
out_thread_dst_desc
,
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out_thread
,
k_thread_work_begin
,
out_thread_src_desc
,
ho_thread_work_begin
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
wo_thread_work_begin
));
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
f_copy
);
}
}
}
}
}
}
src/include/blockwise_tensor_op.cuh
View file @
24d2f034
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#define BLOCKWISE_TENSOR_OP_METHOD 12
template
<
class
TFloat
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_pointwise_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
#if BLOCKWISE_TENSOR_OP_METHOD == 11
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
}
#endif
for
(
unsigned
i
=
threadIdx
.
x
;
i
<
desc
.
GetElementSize
();
i
+=
BlockSize
)
{
unsigned
is
=
i
;
const
unsigned
did0
=
is
/
desc
.
GetStride
(
I0
);
is
-=
did0
*
desc
.
GetStride
(
I0
);
const
unsigned
did1
=
is
/
desc
.
GetStride
(
I1
);
is
-=
did1
*
desc
.
GetStride
(
I1
);
const
unsigned
did2
=
is
/
desc
.
GetStride
(
I2
);
is
-=
did2
*
desc
.
GetStride
(
I2
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 12
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
print_ConstantTensorDescriptor(
src
_desc, "blockwise_4d_tensor_op_
bi
nary:
src
_desc: ");
print_ConstantTensorDescriptor(
dst
_desc, "blockwise_4d_tensor_op_
u
nary:
dst
_desc: ");
print_ConstantTensorDescriptor(
dst_
desc, "blockwise_4d_tensor_op_
bi
nary:
dst_
desc: ");
print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_
u
nary: desc: ");
}
}
#endif
#endif
...
@@ -116,11 +41,9 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -116,11 +41,9 @@ __device__ void blockwise_4d_tensor_op_binary(
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
...
@@ -145,39 +68,34 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -145,39 +68,34 @@ __device__ void blockwise_4d_tensor_op_binary(
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
}
}
}
}
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
class
DstDesc
,
__device__
void
blockwise_4d_tensor_pointwise_op_binary
(
unsigned
NBlockOpLen0
,
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
print_ConstantTensorDescriptor(
dst
_desc, "blockwise_4d_tensor_op_
u
nary:
dst
_desc: ");
print_ConstantTensorDescriptor(
src
_desc, "blockwise_4d_tensor_op_
bi
nary:
src
_desc: ");
print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_
u
nary: desc: ");
print_ConstantTensorDescriptor(
dst_
desc, "blockwise_4d_tensor_op_
bi
nary:
dst_
desc: ");
}
}
#endif
#endif
...
@@ -201,9 +119,11 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
...
@@ -201,9 +119,11 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
...
@@ -228,312 +148,44 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
...
@@ -228,312 +148,44 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 21
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
}
}
}
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 22
template
<
class
TFloat
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
TFloat
*
__restrict__
p_dst
)
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
blockwise_4d_tensor_pointwise_op_unary
<
TFloat
,
DstDesc
,
decltype
(
f_set_zero
),
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
const
unsigned
sindex_save0
=
sindex
;
const
unsigned
dindex_save0
=
dindex
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
const
unsigned
sindex_save1
=
sindex
;
const
unsigned
dindex_save1
=
dindex
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
const
unsigned
sindex_save2
=
sindex
;
const
unsigned
dindex_save2
=
dindex
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
}
sindex
=
sindex_save2
+
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
);
dindex
=
dindex_save2
+
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
);
}
sindex
=
sindex_save1
+
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
);
dindex
=
dindex_save1
+
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
);
}
sindex
=
sindex_save0
+
NBlockOpLen0
*
src_desc
.
GetStride
(
I0
);
dindex
=
dindex_save0
+
NBlockOpLen0
*
dst_desc
.
GetStride
(
I0
);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 23
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_copy
(
SrcDesc
,
class
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
class
DstDesc
,
DstDesc
,
unsigned
NBlockOpLen0
,
TFloat
*
__restrict__
p_dst
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
blockwise_4d_tensor_pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
decltype
(
f_copy
),
BlockSize
>
(
constexpr
auto
dst_desc
=
DstDesc
{};
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
f_copy
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
unsigned
i1
=
0
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
unsigned
i2
=
0
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
unsigned
i3
=
0
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
++
i3
;
}
sindex
+=
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
)
-
i3
*
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
)
-
i3
*
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
++
i2
;
}
sindex
+=
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
);
dindex
+=
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
);
++
i1
;
}
sindex
+=
NBlockOpLen0
*
src_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
);
dindex
+=
NBlockOpLen0
*
dst_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 31
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_accumulate
(
SrcDesc
,
class
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
class
DstDesc
,
DstDesc
,
unsigned
NBlockOpLen0
,
TFloat
*
__restrict__
p_dst
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_accum
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
+=
src
;
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
blockwise_4d_tensor_pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
decltype
(
f_accum
),
BlockSize
>
(
constexpr
auto
dst_desc
=
DstDesc
{};
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
f_accum
);
}
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
\ No newline at end of file
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
}
}
}
#endif
src/include/blockwise_winograd_transform.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
S
,
unsigned
R
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
BlockSize
>
__device__
void
blockwise_winograd_transform_input
(
TFloat
*
const
__restrict__
p_in
,
TFloat
*
__restrict__
p_in_transform
)
{
p_in_transform
[
0
]
=
1
;
}
template
<
class
TFloat
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
S
,
unsigned
R
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
KPerBlock
,
unsigned
CPerBlock
,
unsigned
BlockSize
>
__device__
void
blockwise_winograd_transform_weight
(
TFloat
*
const
__restrict__
p_wei
,
TFloat
*
__restrict__
p_wei_transform
)
{
p_wei_transform
[
0
]
=
1
;
}
\ No newline at end of file
src/include/constant_tensor_descriptor.cuh
View file @
24d2f034
...
@@ -7,8 +7,8 @@ struct Constant
...
@@ -7,8 +7,8 @@ struct Constant
const
T
mValue
=
N
;
const
T
mValue
=
N
;
};
};
template
<
unsigned
I
>
template
<
unsigned
N
>
using
Index
=
Constant
<
unsigned
,
I
>
;
using
Number
=
Constant
<
unsigned
,
N
>
;
template
<
unsigned
...
Is
>
template
<
unsigned
...
Is
>
struct
Sequence
struct
Sequence
...
@@ -18,7 +18,7 @@ struct Sequence
...
@@ -18,7 +18,7 @@ struct Sequence
const
unsigned
mData
[
nDim
]
=
{
Is
...};
const
unsigned
mData
[
nDim
]
=
{
Is
...};
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
Get
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
Get
(
Number
<
I
>
)
const
{
{
return
mData
[
I
];
return
mData
[
I
];
}
}
...
@@ -28,7 +28,7 @@ template <class Lengths, class Strides>
...
@@ -28,7 +28,7 @@ template <class Lengths, class Strides>
struct
ConstantTensorDescriptor
struct
ConstantTensorDescriptor
{
{
static
constexpr
unsigned
nDim
=
Lengths
::
nDim
;
static
constexpr
unsigned
nDim
=
Lengths
::
nDim
;
using
NDimConstant
=
Index
<
nDim
>
;
using
NDimConstant
=
Number
<
nDim
>
;
__host__
__device__
constexpr
ConstantTensorDescriptor
()
__host__
__device__
constexpr
ConstantTensorDescriptor
()
{
{
...
@@ -42,15 +42,15 @@ struct ConstantTensorDescriptor
...
@@ -42,15 +42,15 @@ struct ConstantTensorDescriptor
__host__
__device__
constexpr
Strides
GetStrides
()
const
{
return
Strides
{};
}
__host__
__device__
constexpr
Strides
GetStrides
()
const
{
return
Strides
{};
}
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetLength
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
GetLength
(
Number
<
I
>
)
const
{
{
return
Lengths
{}.
Get
(
Index
<
I
>
{});
return
Lengths
{}.
Get
(
Number
<
I
>
{});
}
}
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetStride
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
GetStride
(
Number
<
I
>
)
const
{
{
return
Strides
{}.
Get
(
Index
<
I
>
{});
return
Strides
{}.
Get
(
Number
<
I
>
{});
}
}
// this is ugly, only for 4d
// this is ugly, only for 4d
...
@@ -58,10 +58,10 @@ struct ConstantTensorDescriptor
...
@@ -58,10 +58,10 @@ struct ConstantTensorDescriptor
{
{
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
return
GetLength
(
I0
)
*
GetLength
(
I1
)
*
GetLength
(
I2
)
*
GetLength
(
I3
);
return
GetLength
(
I0
)
*
GetLength
(
I1
)
*
GetLength
(
I2
)
*
GetLength
(
I3
);
}
}
...
@@ -71,10 +71,10 @@ struct ConstantTensorDescriptor
...
@@ -71,10 +71,10 @@ struct ConstantTensorDescriptor
{
{
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
return
(
GetLength
(
I0
)
-
1
)
*
GetStride
(
I0
)
+
(
GetLength
(
I1
)
-
1
)
*
GetStride
(
I1
)
+
return
(
GetLength
(
I0
)
-
1
)
*
GetStride
(
I0
)
+
(
GetLength
(
I1
)
-
1
)
*
GetStride
(
I1
)
+
(
GetLength
(
I2
)
-
1
)
*
GetStride
(
I2
)
+
(
GetLength
(
I3
)
-
1
)
*
GetStride
(
I3
)
+
1
;
(
GetLength
(
I2
)
-
1
)
*
GetStride
(
I2
)
+
(
GetLength
(
I3
)
-
1
)
*
GetStride
(
I3
)
+
1
;
...
@@ -83,10 +83,10 @@ struct ConstantTensorDescriptor
...
@@ -83,10 +83,10 @@ struct ConstantTensorDescriptor
// this is ugly, only for 4d
// this is ugly, only for 4d
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
...
@@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe
...
@@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
...
@@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
...
@@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
{
{
constexpr
auto
desc
=
TDesc
{};
constexpr
auto
desc
=
TDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
desc
.
GetDimension
()
==
4
,
"dim is not 4"
);
static_assert
(
desc
.
GetDimension
()
==
4
,
"dim is not 4"
);
...
...
src/include/gridwise_direct_convolution_1.cuh
View file @
24d2f034
...
@@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutGlobalDesc
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
TFloat
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
...
@@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
}
}
#endif
#endif
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
auto
f_accu
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
+=
src
;
};
// set output tensor in LDS to 0
// set output tensor in LDS to 0
blockwise_4d_tensor_op_unary
<
TFloat
,
blockwise_4d_tensor_set_zero
<
TFloat
,
decltype
(
out_block_desc
),
BlockSize
>
(
out_block_desc
,
decltype
(
out_block_desc
),
p_out_block
);
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_set0
),
BlockSize
>
(
out_block_desc
,
p_out_block
,
f_set0
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
)
c_block_work_begin
+=
CPerBlock
,
__syncthreads
()
)
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
in_block_src_desc
),
decltype
(
in_block_src_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
in_block_src_desc
,
NBlockOpLen1
,
p_in_global
+
NBlockOpLen2
,
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
NBlockOpLen3
,
c_block_work_begin
,
decltype
(
f_copy
),
hi_block_work_begin
,
BlockSize
>
(
in_block_src_desc
,
wi_block_work_begin
),
p_in_global
+
in_block_desc
,
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
p_in_block
);
c_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
in_block_desc
,
p_in_block
,
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
wei_block_src_desc
,
wei_block_src_desc
,
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
p_wei_block
,
p_wei_block
);
f_copy
);
#if 1
__syncthreads
();
__syncthreads
();
#endif
// blockwise convolution
// blockwise convolution
blockwise_convolution
<
TFloat
,
blockwise_convolution
<
TFloat
,
...
@@ -186,27 +162,17 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -186,27 +162,17 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutTileSizeW
,
OutTileSizeW
,
BlockSize
>
(
BlockSize
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
#if 1
__syncthreads
();
#endif
}
}
// copy output tensor from LDS to device mem
// copy output tensor from LDS to device mem
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_src_desc
),
decltype
(
out_block_src_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
out_block_desc
,
out_block_desc
,
p_out_block
,
p_out_block
,
out_block_src_desc
,
out_block_src_desc
,
p_out_global
+
p_out_global
+
out_global_desc
.
Get1dIndex
(
out_global_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
));
f_copy
);
}
}
\ No newline at end of file
src/include/gridwise_direct_convolution_2.cuh
View file @
24d2f034
...
@@ -19,10 +19,6 @@ template <class TFloat,
...
@@ -19,10 +19,6 @@ template <class TFloat,
unsigned
NPerThread
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
CPerThread
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
...
@@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
OutGlobalDesc
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
TFloat
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
...
@@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
const
unsigned
hi_thread_data_offset
=
ho_thread_data_offset
;
const
unsigned
hi_thread_data_offset
=
ho_thread_data_offset
;
const
unsigned
wi_thread_data_offset
=
wo_thread_data_offset
;
const
unsigned
wi_thread_data_offset
=
wo_thread_data_offset
;
// op
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
...
@@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
#endif
#endif
// set threadwise output tensor to 0
// set threadwise output tensor to 0
threadwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_thread_desc
),
decltype
(
f_set0
)
>
(
threadwise_4d_tensor_set_zero
(
out_thread_desc
,
p_out_thread
);
out_thread_desc
,
p_out_thread
,
f_set0
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
in_block_global_desc
),
decltype
(
in_block_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
in_block_global_desc
,
NBlockOpLen1
,
p_in_global
+
NBlockOpLen2
,
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
NBlockOpLen3
,
c_block_data_offset
,
decltype
(
f_copy
),
hi_block_data_offset
,
BlockSize
>
(
wi_block_data_offset
),
in_block_global_desc
,
in_block_desc
,
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
p_in_block
);
c_block_data_offset
,
hi_block_data_offset
,
wi_block_data_offset
),
in_block_desc
,
p_in_block
,
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
wei_block_global_desc
,
wei_block_global_desc
,
p_wei_global
+
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
p_wei_block
,
p_wei_block
);
f_copy
);
__syncthreads
();
__syncthreads
();
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
{
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
in_thread_block_desc
,
decltype
(
in_thread_block_desc
),
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
decltype
(
in_thread_desc
),
c_thread_data
,
decltype
(
f_copy
)
>
(
hi_thread_data_offset
,
in_thread_block_desc
,
wi_thread_data_offset
),
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
in_thread_desc
,
c_thread_data
,
p_in_thread
);
hi_thread_data_offset
,
wi_thread_data_offset
),
in_thread_desc
,
p_in_thread
,
f_copy
);
// copy weight tensor into register
// copy weight tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
wei_thread_block_desc
),
decltype
(
wei_thread_desc
),
decltype
(
f_copy
)
>
(
wei_thread_block_desc
,
wei_thread_block_desc
,
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
wei_thread_desc
,
wei_thread_desc
,
p_wei_thread
,
p_wei_thread
);
f_copy
);
// threadwise convolution
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
threadwise_direct_convolution
<
TFloat
,
...
@@ -255,16 +225,12 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -255,16 +225,12 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
}
}
// copy output tensor from register to global mem
// copy output tensor from register to global mem
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
out_thread_desc
),
decltype
(
out_thread_global_desc
),
decltype
(
f_copy
)
>
(
out_thread_desc
,
out_thread_desc
,
p_out_thread
,
p_out_thread
,
out_thread_global_desc
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
ho_block_data_offset
+
ho_thread_data_offset
,
ho_block_data_offset
+
ho_thread_data_offset
,
wo_block_data_offset
+
wo_thread_data_offset
),
wo_block_data_offset
+
wo_thread_data_offset
));
f_copy
);
}
}
src/include/gridwise_winograd_convolution.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "blockwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh"
template
<
class
TFloat
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
GridSize
>
__global__
void
gridwise_winograd_convolution
(
InGlobalDesc
,
TFloat
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
TFloat
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
S
=
wei_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
HoPerBlock
=
OutTileSizeH
*
YPerBlock
;
constexpr
unsigned
WoPerBlock
=
OutTileSizeW
*
XPerBlock
;
constexpr
unsigned
HiPerBlock
=
YPerBlock
*
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
WiPerBlock
=
XPerBlock
*
OutTileSizeW
+
R
-
1
;
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeW
=
OutTileSizeW
+
R
-
1
;
// divide block work
constexpr
unsigned
NBlockWork
=
(
out_global_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
KBlockWork
=
(
out_global_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
unsigned
YBlockWork
=
(
out_global_desc
.
GetLength
(
I2
)
+
HoPerBlock
-
1
)
/
HoPerBlock
;
constexpr
unsigned
XBlockWork
=
(
out_global_desc
.
GetLength
(
I3
)
+
WoPerBlock
-
1
)
/
WoPerBlock
;
const
unsigned
block_id
=
blockIdx
.
x
;
unsigned
itmp
=
block_id
;
const
unsigned
n_block_work_id
=
itmp
/
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
itmp
-=
n_block_work_id
*
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
const
unsigned
k_block_work_id
=
itmp
/
(
YBlockWork
*
XBlockWork
);
itmp
-=
k_block_work_id
*
(
YBlockWork
*
XBlockWork
);
const
unsigned
y_block_work_id
=
itmp
/
XBlockWork
;
const
unsigned
x_block_work_id
=
itmp
-
y_block_work_id
*
XBlockWork
;
const
unsigned
n_block_data_offset
=
n_block_work_id
*
NPerBlock
;
const
unsigned
k_block_data_offset
=
k_block_work_id
*
KPerBlock
;
const
unsigned
y_block_data_offset
=
y_block_work_id
*
YPerBlock
;
const
unsigned
x_block_data_offset
=
x_block_work_id
*
XPerBlock
;
const
unsigned
ho_block_data_offset
=
y_block_data_offset
*
OutTileSizeH
;
const
unsigned
wo_block_data_offset
=
x_block_data_offset
*
OutTileSizeW
;
const
unsigned
hi_block_data_offset
=
ho_block_data_offset
;
// minus padding
const
unsigned
wi_block_data_offset
=
wo_block_data_offset
;
// minus padding
// divide thread work
constexpr
unsigned
NThreadWork
=
(
NPerBlock
+
NPerThread
-
1
)
/
NPerThread
;
constexpr
unsigned
KThreadWork
=
(
KPerBlock
+
KPerThread
-
1
)
/
KPerThread
;
constexpr
unsigned
YThreadWork
=
YPerBlock
;
constexpr
unsigned
XThreadWork
=
XPerBlock
;
const
unsigned
thread_id
=
threadIdx
.
x
;
itmp
=
thread_id
;
const
unsigned
n_thread_work_id
=
itmp
/
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
itmp
-=
n_thread_work_id
*
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
const
unsigned
k_thread_work_id
=
itmp
/
(
YThreadWork
*
XThreadWork
);
itmp
-=
k_thread_work_id
*
(
YThreadWork
*
XThreadWork
);
const
unsigned
y_thread_work_id
=
itmp
/
XThreadWork
;
const
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XThreadWork
;
const
unsigned
n_thread_data_offset
=
n_thread_work_id
*
NPerThread
;
const
unsigned
k_thread_data_offset
=
k_thread_work_id
*
KPerThread
;
const
unsigned
y_thread_data_offset
=
y_thread_work_id
;
const
unsigned
x_thread_data_offset
=
x_thread_work_id
;
// op
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// block data
constexpr
auto
in_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
CPerBlock
,
YPerBlock
*
InTileSizeH
,
XPerBlock
*
InTileSizeW
>
{});
constexpr
auto
wei_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
unsigned
in_transform_block_size
=
in_transform_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_transform_block_size
=
wei_transform_block_desc
.
GetElementSpace
();
__shared__
TFloat
p_in_transform_block
[
in_transform_block_size
];
__shared__
TFloat
p_wei_transform_block
[
wei_transform_block_size
];
// thread data
constexpr
auto
in_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
in_transform_block_desc
.
GetStrides
());
constexpr
auto
wei_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
wei_transform_block_desc
.
GetStrides
());
constexpr
auto
out_transform_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
auto
out_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
OutTileSizeH
,
OutTileSizeW
>
{});
constexpr
auto
out_thread_global_desc
=
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
(),
out_global_desc
.
GetStrides
());
constexpr
unsigned
out_transform_thread_size
=
out_transform_thread_desc
.
GetElementSpace
();
constexpr
unsigned
out_thread_size
=
out_thread_desc
.
GetElementSpace
();
TFloat
p_out_transform_thread
[
out_transform_thread_size
];
TFloat
p_out_thread
[
out_thread_size
];
#if 0
if(blockIdx.x == 0 && threadIdx.x == 0)
{
printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size "
"%u, out_thread_size %u \n",
in_transform_block_size,
wei_transform_block_size,
out_transform_thread_size,
out_thread_size);
}
#endif
// set threadwise output transform tensor to 0
threadwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_transform_thread_desc
),
decltype
(
f_set0
)
>
(
out_transform_thread_desc
,
p_out_transform_thread
,
f_set0
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
{
// blockwise transform input
blockwise_winograd_transform_input
<
TFloat
,
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
BlockSize
>
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
c_block_data_offset
,
hi_block_data_offset
,
wi_block_data_offset
),
p_in_transform_block
);
// blockwise transform weights
blockwise_winograd_transform_weight
<
TFloat
,
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
,
KPerBlock
,
CPerBlock
,
BlockSize
>
(
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
p_wei_transform_block
);
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
// threadwise point multiplication
threadwise_winograd_calculate_transformed_output
<
TFloat
,
decltype
(
in_transform_thread_block_desc
),
decltype
(
wei_transform_thread_block_desc
),
decltype
(
out_transform_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
c_thread_data
,
y_thread_data_offset
*
InTileSizeH
,
x_thread_data_offset
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
};
// transform back
threadwise_winograd_reverse_transform_output
<
TFloat
,
decltype
(
out_transform_thread_desc
),
decltype
(
out_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
out_transform_thread_desc
,
p_out_transform_thread
,
out_thread_desc
,
p_out_thread
);
// copy output tensor from register to global mem
threadwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
out_thread_desc
),
decltype
(
out_thread_global_desc
),
decltype
(
f_copy
)
>
(
out_thread_desc
,
p_out_thread
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
ho_block_data_offset
+
y_thread_data_offset
*
OutTileSizeH
,
wo_block_data_offset
+
x_thread_data_offset
*
OutTileSizeW
),
f_copy
);
}
\ No newline at end of file
src/include/threadwise_direct_convolution.cuh
View file @
24d2f034
...
@@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc,
...
@@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc,
OutDesc
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
...
src/include/threadwise_tensor_op.cuh
View file @
24d2f034
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#define THREADWISE_TENSOR_OP_METHOD 0
template <class TFloat, class Desc, class F>
__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p_dst, F f)
{
    // Visit every element of the 4-d tensor described by Desc and apply the
    // element-wise functor f to it, one dimension per nested loop.
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto desc = Desc{};

    for(unsigned i0 = 0; i0 < desc.GetLength(I0); ++i0)
    {
        for(unsigned i1 = 0; i1 < desc.GetLength(I1); ++i1)
        {
            for(unsigned i2 = 0; i2 < desc.GetLength(I2); ++i2)
            {
                for(unsigned i3 = 0; i3 < desc.GetLength(I3); ++i3)
                {
                    // Map the 4-d coordinate to the element's linear offset,
                    // then let the functor mutate the element in place.
                    const unsigned offset = desc.Get1dIndex(i0, i1, i2, i3);
                    f(p_dst[offset]);
                }
            }
        }
    }
}
...
@@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds
...
@@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op_binary
(
__device__
void
threadwise_4d_tensor_
pointwise_
op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
...
@@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary(
...
@@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary(
{
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
{
const
unsigned
sindex
=
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op_binary: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
}
}
}
}
}
}
}
#endif
#if THREADWISE_TENSOR_OP_METHOD == 1
template <class TFloat, class Desc>
__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p_dst)
{
    // Zero-fill the 4-d tensor described by Desc by running the generic
    // element-wise visitor with a "write zero" functor.
    auto zero_out = [](TFloat& v) { v = TFloat(0); };

    threadwise_4d_tensor_pointwise_op_unary<TFloat, Desc, decltype(zero_out)>(
        Desc{}, p_dst, zero_out);
}
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
I0
);
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
I1
);
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
src_desc
.
GetLength
(
I2
);
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
sindex
+=
src_desc
.
GetStride
(
I3
);
dindex
+=
dst_desc
.
GetStride
(
I3
);
}
sindex
+=
src_desc
.
GetStride
(
I2
)
-
src_desc
.
GetLength
(
I3
)
*
src_desc
.
GetStride
(
I3
);
dindex
+=
dst_desc
.
GetStride
(
I2
)
-
dst_desc
.
GetLength
(
I3
)
*
dst_desc
.
GetStride
(
I3
);
}
sindex
+=
src_desc
.
GetStride
(
I1
)
-
src_desc
.
GetLength
(
I2
)
*
src_desc
.
GetStride
(
I2
);
template <class TFloat, class SrcDesc, class DstDesc>
__device__ void threadwise_4d_tensor_copy(SrcDesc,
                                          TFloat* const __restrict__ p_src,
                                          DstDesc,
                                          TFloat* __restrict__ p_dst)
{
    // Element-wise copy from a source 4-d tensor to a destination 4-d tensor
    // (the two descriptors may differ in strides but must cover the same
    // logical extent). Delegates to the binary point-wise visitor with a
    // plain assignment functor.
    auto assign = [](const TFloat& s, TFloat& d) { d = s; };

    threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(assign)>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, assign);
}
}
\ No newline at end of file
#endif
src/include/threadwise_winograd_transform.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
template <class TFloat,
          class InTransThreadDesc,  //{NPerThread, CPerThread, InTileSizeH, InTileSizeW}
          class WeiTransThreadDesc, //{KPerThread, CPerThread, InTileSizeH, InTileSizeW}
          class OutTransThreadDesc, //{NPerThread, KPerThread, InTileSizeH, InTileSizeW}
          unsigned InTileSizeH,
          unsigned InTileSizeW,
          unsigned S,
          unsigned R,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW>
__device__ void threadwise_winograd_calculate_transformed_output(
    InTransThreadDesc,
    TFloat* const __restrict__ p_in_transform_thread,
    WeiTransThreadDesc,
    TFloat* const __restrict__ p_wei_transform_thread,
    OutTransThreadDesc,
    TFloat* __restrict__ p_out_transform_thread)
{
    // Element-wise (Hadamard-style) product-and-accumulate stage of the
    // Winograd convolution: for every output coordinate (n, k, h, w),
    // accumulate over input channels c:
    //   out[n, k, h, w] += sum_c wei[k, c, h, w] * in[n, c, h, w]
    // The output buffer is accumulated into, not overwritten — callers are
    // expected to have initialized it (e.g. via a set-zero pass).
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto in_transform_thread_desc  = InTransThreadDesc{};
    constexpr auto wei_transform_thread_desc = WeiTransThreadDesc{};
    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};

    for(unsigned n = 0; n < out_transform_thread_desc.GetLength(I0); ++n)
    {
        for(unsigned k = 0; k < out_transform_thread_desc.GetLength(I1); ++k)
        {
            for(unsigned h = 0; h < out_transform_thread_desc.GetLength(I2); ++h)
            {
                for(unsigned w = 0; w < out_transform_thread_desc.GetLength(I3); ++w)
                {
                    // The output index does not depend on c: hoist it out of
                    // the reduction loop and accumulate in a register instead
                    // of re-reading/re-writing memory every iteration. Safe
                    // because the pointers are declared __restrict__.
                    const unsigned out_index =
                        out_transform_thread_desc.Get1dIndex(n, k, h, w);

                    TFloat acc = p_out_transform_thread[out_index];

                    for(unsigned c = 0; c < wei_transform_thread_desc.GetLength(I1); ++c)
                    {
                        const unsigned in_index =
                            in_transform_thread_desc.Get1dIndex(n, c, h, w);

                        const unsigned wei_index =
                            wei_transform_thread_desc.Get1dIndex(k, c, h, w);

                        acc += p_wei_transform_thread[wei_index] *
                               p_in_transform_thread[in_index];
                    }

                    p_out_transform_thread[out_index] = acc;
                }
            }
        }
    }
}
// Inverse (output) transform of Winograd F(2x2, 3x3): maps each 4x4
// transformed-output tile M back to a 2x2 spatial output tile Y.
// The coefficient pattern below is exactly Y = A^T * M * A with
//   A^T = [[1,  1,  1,  0],
//          [0,  1, -1, -1]]
// unrolled by hand for the four output positions.
template <class TFloat,
          class OutTransThreadDesc, //{NPerThread, KPerThread, InTileSizeH, InTileSizeW}
          class OutThreadDesc,      //{NPerThread, CPerThread, OutTileSizeH, OutTileSizeW}
          unsigned InTileSizeH,
          unsigned InTileSizeW,
          unsigned S,
          unsigned R,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW>
__device__ void threadwise_winograd_reverse_transform_output(
    OutTransThreadDesc,
    TFloat* const __restrict__ p_out_transform_thread,
    OutThreadDesc,
    TFloat* __restrict__ p_out_thread)
{
    // Only the F(2x2, 3x3) configuration (4x4 input tile, 3x3 filter,
    // 2x2 output tile) is supported by this hand-unrolled version.
    static_assert(InTileSizeH == 4, "wrong");
    static_assert(InTileSizeW == 4, "wrong");
    static_assert(S == 3, "wrong");
    static_assert(R == 3, "wrong");
    static_assert(OutTileSizeH == 2, "wrong");
    static_assert(OutTileSizeW == 2, "wrong");

    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};
    constexpr auto out_thread_desc           = OutThreadDesc{};

    // Descriptor extents must agree with the compile-time tile sizes.
    static_assert(InTileSizeH == out_transform_thread_desc.GetLength(I2), "wrong");
    static_assert(InTileSizeW == out_transform_thread_desc.GetLength(I3), "wrong");
    static_assert(OutTileSizeH == out_thread_desc.GetLength(I2), "wrong");
    static_assert(OutTileSizeW == out_thread_desc.GetLength(I3), "wrong");

    // Loop over batch (n) and output channel (k); the 2x2 spatial tile is
    // written explicitly below. In the expressions, m(i,j) denotes
    // p_out_transform_thread[Get1dIndex(n, k, i, j)].
    for(unsigned n = 0; n < out_thread_desc.GetLength(I0); ++n)
    {
        for(unsigned k = 0; k < out_thread_desc.GetLength(I1); ++k)
        {
            // y(0,0) = sum of m(0..2, 0..2)  — row [1,1,1,0] on both sides.
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 0)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)];

            // y(0,1) = sum over rows 0..2 of (m(r,1) - m(r,2) - m(r,3))
            //          — column side uses [0,1,-1,-1].
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 1)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 3)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)];

            // y(1,0) = sum over cols 0..2 of (m(1,c) - m(2,c) - m(3,c))
            //          — row side uses [0,1,-1,-1].
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 0)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 0)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)];

            // y(1,1) = (m(1,1)-m(1,2)-m(1,3)) - (m(2,1)-m(2,2)-m(2,3))
            //          - (m(3,1)-m(3,2)-m(3,3))
            //          — [0,1,-1,-1] applied on both row and column sides.
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 1)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 3)];
        }
    }
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment