gaoqiong / composable_kernel · Commits · 1c62b47b
"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "2103a5073c2e3de3e2bedeb3f406354343af0885"
Commit 1c62b47b · authored Feb 22, 2021 by Chao Liu

clean up

parent c9b52bce
Showing 10 changed files with 0 additions and 1964 deletions (+0 −1964)
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp  +0 −133
composable_kernel/include/kernel_algorithm/gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp  +0 −255
composable_kernel/include/tensor_description/dynamic_tensor_coordinate_v1.hpp  +0 −274
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v1.hpp  +0 −35
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v1.hpp  +0 −342
driver/CMakeLists.txt  +0 −4
driver/include/device_col2im_eb_nchw.hpp  +0 −119
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp  +0 −130
driver/include/host_col2im.hpp  +0 −77
driver/src/col2im_driver.cpp  +0 −595
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"

namespace ck {

// B = merge(N, Ho, Wo)
template <index_t GridSize,
          index_t BlockSize,
          typename Float,
          typename ColGlobalDesc,
          typename ImgGlobalDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads,
          index_t EPerBlock,
          index_t BPerBlock,
          typename BlockCopySubLengths_E_B,
          typename BlockCopyClusterLengths_E_B,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_B>
struct GridwiseCol2Im_eb_nchw
{
    __device__ void Run(const Float* const __restrict__ p_col_global,
                        Float* const __restrict__ p_img_global) const
    {
        constexpr auto col_e_b_global_desc       = ColGlobalDesc{};
        constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};

        constexpr index_t N  = img_n_c_hi_wi_global_desc.GetLengths()[0];
        constexpr index_t C  = img_n_c_hi_wi_global_desc.GetLengths()[1];
        constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
        constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];

        constexpr index_t Ho = OutputSizes{}[0];
        constexpr index_t Wo = OutputSizes{}[1];

        constexpr index_t Y = FilterSizes{}[0];
        constexpr index_t X = FilterSizes{}[1];

        constexpr index_t ConvStrideH = ConvStrides{}[0];
        constexpr index_t ConvStrideW = ConvStrides{}[1];

        constexpr index_t ConvDilationH = ConvDilations{}[0];
        constexpr index_t ConvDilationW = ConvDilations{}[1];

        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

        // sanity-check for vectorized memory load
        static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
                          (X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
                      "wrong! alignment requirement for vectorized global load of input tensor "
                      "will be violated");

        // divide block work by [E, B]
        static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t EBlockWork = E / EPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});

        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

        const index_t e_block_data_on_global = block_work_id[Number<0>{}] * EPerBlock;
        const index_t b_block_data_on_global = block_work_id[Number<1>{}] * BPerBlock;

        // construct img_eb_global_desc
        constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            img_n_c_hi_wi_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

        constexpr index_t Hip = img_n_c_hip_wip_global_desc.GetLengths()[2];
        constexpr index_t Wip = img_n_c_hip_wip_global_desc.GetLengths()[3];

        constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            img_n_c_hip_wip_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
            img_n_c_y_ho_x_wo_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // blockwise atomic accumulation
        auto blockwise_copy =
            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                               decltype(col_e_b_global_desc),
                                               decltype(img_e_b_global_desc),
                                               Sequence<EPerBlock, BPerBlock>,
                                               BlockCopySubLengths_E_B,
                                               BlockCopyClusterLengths_E_B,
                                               BlockCopyThreadClusterArrangeOrder,
                                               BlockCopySrcAccessOrder,
                                               BlockCopyDstAccessOrder,
                                               1,
                                               1,
                                               BlockCopyDataPerAccess_B,
                                               BlockCopyDataPerAccess_B,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Global,
                                               InMemoryDataOperation::AtomicAdd>(
                make_multi_index(e_block_data_on_global, b_block_data_on_global),
                make_multi_index(e_block_data_on_global, b_block_data_on_global));

        // blockwise copy
        blockwise_copy.Run(p_col_global, p_img_global);
    }
};

} // namespace ck
#endif
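For orientation, the Merge transforms above realize the usual im2col flattening, with the kernel scattering each column entry back into the padded image. A minimal host-side sketch of that index algebra (plain C++; the helper map_to_col is hypothetical, not part of this repository):

#include <cassert>

// Hypothetical helper illustrating the flattening used above:
//   column row    e = c * (Y * X) + y * X + x        (E = C * Y * X)
//   column column b = n * (Ho * Wo) + ho * Wo + wo   (B = N * Ho * Wo)
// plus the padded-image pixel each (e, b) pair touches.
struct ColIndex { int e, b, hip, wip; };

ColIndex map_to_col(int n, int c, int y, int x, int ho, int wo,
                    int Y, int X, int Ho, int Wo,
                    int stride_h, int stride_w, int dil_h, int dil_w)
{
    ColIndex r;
    r.e   = c * (Y * X) + y * X + x;
    r.b   = n * (Ho * Wo) + ho * Wo + wo;
    r.hip = ho * stride_h + y * dil_h; // row in the implicitly padded image
    r.wip = wo * stride_w + x * dil_w;
    return r;
}

int main()
{
    // 3x3 filter, stride 2, dilation 1, 8x8 output
    ColIndex r = map_to_col(/*n=*/0, /*c=*/2, /*y=*/1, /*x=*/2, /*ho=*/3, /*wo=*/4,
                            /*Y=*/3, /*X=*/3, /*Ho=*/8, /*Wo=*/8, 2, 2, 1, 1);
    assert(r.e == 2 * 9 + 1 * 3 + 2); // 23
    assert(r.b == 3 * 8 + 4);         // 28
    assert(r.hip == 3 * 2 + 1 && r.wip == 4 * 2 + 2);
    return 0;
}

Under this mapping img_e_b_global_desc addresses the same (e, b) grid as col_e_b_global_desc, which is why a single blockwise copy with InMemoryDataOperation::AtomicAdd implements col2im.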
composable_kernel/include/kernel_algorithm/gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP
#define CK_GRIDWISE_DYNAMIC_COL2IM_GEMMKGEMMN_NCHW_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"

namespace ck {

template <typename... In>
__host__ __device__ constexpr auto
map_img_into_col(const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
                 const MultiIndex<2> out_sizes,
                 const MultiIndex<2> filter_sizes,
                 const MultiIndex<2> conv_strides,
                 const MultiIndex<2> conv_dilations,
                 const MultiIndex<2> in_left_pads,
                 const MultiIndex<2> in_right_pads)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    const index_t N  = in_n_c_hi_wi_global_desc.GetLength(I0);
    const index_t C  = in_n_c_hi_wi_global_desc.GetLength(I1);
    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);

    const index_t Ho = out_sizes[I0];
    const index_t Wo = out_sizes[I1];

    const index_t Y = filter_sizes[I0];
    const index_t X = filter_sizes[I1];

    const index_t ConvStrideH = conv_strides[I0];
    const index_t ConvStrideW = conv_strides[I1];

    const index_t ConvDilationH = conv_dilations[I0];
    const index_t ConvDilationW = conv_dilations[I1];

    const index_t InLeftPadH = in_left_pads[I0];
    const index_t InLeftPadW = in_left_pads[I1];

    const index_t InRightPadH = in_right_pads[I0];
    const index_t InRightPadW = in_right_pads[I1];

    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
        transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicLeftPad{Hi, InLeftPadH},
                       DynamicLeftPad{Wi, InLeftPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
    const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);

    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_hip_wip_global_desc,
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicEmbed<2>{make_multi_index(Y, Ho),
                                   make_multi_index(ConvDilationH, ConvStrideH)},
                   DynamicEmbed<2>{make_multi_index(X, Wo),
                                   make_multi_index(ConvDilationW, ConvStrideW)}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
                   DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return in_gemmk_gemmn_global_desc;
}

template <index_t BlockSize,
          index_t GemmKPerBlock,
          index_t GemmNPerBlock,
          typename BlockCopySubLengths_GemmK_GemmN,
          typename BlockCopyClusterLengths_GemmK_GemmN,
          typename BlockCopyThreadClusterArrangeOrder,
          typename BlockCopySrcAccessOrder,
          typename BlockCopyDstAccessOrder,
          index_t BlockCopyDataPerAccess_GemmN>
struct GridwiseDynamicCol2Im_gemmkgemmn_nchw
{
    // this version has scratch memory issue, due to:
    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r1 keeps reference to tensor descriptor
    // 2. threadwise_dynamic_tensor_slice_transfer_v1r1 constructs new tensor coordinate
    template <typename... Col, typename... Img>
    __device__ void Run_r1(const float* const __restrict__ p_col_global,
                           float* const __restrict__ p_img_global,
                           const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        auto blockwise_copy =
#if 1
            BlockwiseDynamicTensorSliceTransfer_v1r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>
#elif 1
            BlockwiseDynamicTensorSliceTransfer_v2r1<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>
#endif
            (col_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global),
             img_gemmk_gemmn_global_desc,
             make_multi_index(0, gemmn_block_data_on_global));

        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(p_col_global, p_img_global);

            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
        }
    }

    // this version does not have scratch memory issue, due to:
    // 1. ThreadwiseDynamicTensorSliceTransfer_v1r2 does not keep reference to tensor descriptor
    // 2. threadwise_dynamic_tensor_slice_transfer_v1r2 does not construct new tensor coordinate
    template <typename... Col, typename... Img>
    __device__ void Run_r2(const float* const __restrict__ p_col_global,
                           float* const __restrict__ p_img_global,
                           const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                           const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        const index_t GemmK = col_gemmk_gemmn_global_desc.GetLength(I0);
        const index_t GemmN = col_gemmk_gemmn_global_desc.GetLength(I1);

        // divide block work by GemmN
        const index_t GemmNBlockWork = GemmN / GemmNPerBlock;

        const index_t block_work_id = get_block_1d_id();

        const index_t gemmn_block_data_on_global = block_work_id * GemmNPerBlock;

        auto blockwise_copy =
            BlockwiseDynamicTensorSliceTransfer_v2r2<BlockSize,
                                                     float,
                                                     float,
                                                     decltype(col_gemmk_gemmn_global_desc),
                                                     decltype(img_gemmk_gemmn_global_desc),
                                                     Sequence<GemmKPerBlock, GemmNPerBlock>,
                                                     BlockCopySubLengths_GemmK_GemmN,
                                                     BlockCopyClusterLengths_GemmK_GemmN,
                                                     BlockCopyThreadClusterArrangeOrder,
                                                     BlockCopySrcAccessOrder,
                                                     BlockCopyDstAccessOrder,
                                                     1,
                                                     1,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     BlockCopyDataPerAccess_GemmN,
                                                     AddressSpace::Global,
                                                     AddressSpace::Global,
                                                     InMemoryDataOperation::AtomicAdd,
                                                     1,
                                                     1>(
                col_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global),
                img_gemmk_gemmn_global_desc,
                make_multi_index(0, gemmn_block_data_on_global));

        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(col_gemmk_gemmn_global_desc,
                               p_col_global,
                               img_gemmk_gemmn_global_desc,
                               p_img_global);

            blockwise_copy.MoveSrcSliceWindow(col_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
            blockwise_copy.MoveDstSliceWindow(img_gemmk_gemmn_global_desc,
                                              make_multi_index(GemmKPerBlock, 0));
        }
    }

    template <typename... Col, typename... Img>
    __device__ void Run(const float* const __restrict__ p_col_global,
                        float* const __restrict__ p_img_global,
                        const DynamicTensorDescriptor<Col...>& col_gemmk_gemmn_global_desc,
                        const DynamicTensorDescriptor<Img...>& img_gemmk_gemmn_global_desc) const
    {
        Run_r2(p_col_global,
               p_img_global,
               col_gemmk_gemmn_global_desc,
               img_gemmk_gemmn_global_desc);
    }
};

} // namespace ck
#endif
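The comments on Run_r1 and Run_r2 attribute the scratch-memory issue to the transfer object holding a reference to the tensor descriptor. A minimal host-side illustration of that difference (toy types for illustration, not the CK transfer classes):

#include <cstdio>

// Run_r1's transfer stores a reference to the descriptor as a member, so the
// descriptor object must stay materialized for the transfer's lifetime;
// Run_r2's transfer takes the descriptor as an argument on every call, so
// nothing long-lived has to be kept around (or spilled to scratch on a GPU).
struct Desc { int len0, len1; };

struct TransferR1 {
    const Desc& desc; // stored reference: descriptor must outlive the transfer
    void Run() const { std::printf("tile %d x %d\n", desc.len0, desc.len1); }
};

struct TransferR2 {
    // descriptor passed per call: free to live in registers at the call site
    void Run(const Desc& desc) const { std::printf("tile %d x %d\n", desc.len0, desc.len1); }
};

int main()
{
    Desc d{8, 128};
    TransferR1{d}.Run();
    TransferR2{}.Run(d);
    return 0;
}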
composable_kernel/include/tensor_description/dynamic_tensor_coordinate_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_V1_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"

namespace ck {

// A "tensor coordinate" is an opaque object that represents a "point of location" inside a
// tensor. At the bare minimum, the user should be able to query the following information
// from a tensor coordinate:
//   1. Tensor descriptor
//   2. Location, represented in the form of multi-index
//   3. Location, represented in the form of the offset to the origin of the tensor
//   4. Whether the location is inside an invalid area, e.g. the padding area of an implicitly
//      padded tensor is considered invalid, because the padding area doesn't have any physical
//      memory allocation
// A tensor coordinate also provides the following functionality:
//   1. Given a step size in each dimension, update itself, or return a new tensor coordinate,
//      so the user can freely move the "point of location" inside the tensor

// wrapper class for DynamicNativeTensorCoordinate_v1 and DynamicTransformedTensorCoordinate_v1
template <typename TensorDesc>
struct DynamicTensorCoordinate_v1;

// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate_v1
{
    using type             = DynamicNativeTensorCoordinate_v1;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<NDim>;

    __host__ __device__ constexpr DynamicNativeTensorCoordinate_v1(
        const tensor_desc_type& tensor_desc, const Index& idx)
        : tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }

    __host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }

    __host__ __device__ constexpr const auto& GetIndex() const { return idx_; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_ += idx_diff;

        offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_ -= idx_diff;

        offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    __host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
    {
        return tensor_desc_.CalculateOffsetDiff(idx_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_.IsUpperIndexValid(idx_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        // For native tensor, offset is valid if upper-index is valid
        return IsUpperIndexValid();
    }

    // evaluated at compile-time
    __host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
    {
        return true;
    }

    private:
    const tensor_desc_type tensor_desc_;

    // idx_ may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as their
    // calculation as dead code.
    // TODO: make sure compiler indeed removes this dead code
    Index idx_;
    index_t offset_;
};

// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate_v1
{
    static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();

    using UpperDesc  = TensorDesc;
    using UpperCoord = DynamicTransformedTensorCoordinate_v1;
    using UpperIndex = MultiIndex<NDimUp>;

    using LowerDesc  = typename UpperDesc::LowerDesc;
    using LowerCoord = typename DynamicTensorCoordinate_v1<LowerDesc>::type;

    __host__ __device__ constexpr DynamicTransformedTensorCoordinate_v1(
        const UpperDesc& tensor_desc_up, const UpperIndex& idx_up)
        : tensor_desc_up_{tensor_desc_up},
          idx_up_{idx_up},
          coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
                     tensor_desc_up.CalculateLowerIndex(idx_up)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }

    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
    {
        return coord_low_;
    }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }

    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need
        // to know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or
        // not.
        coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_up_ += idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        idx_up_ -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // Calculate offset diff without updating the tensor coordinate.
    // If idx_up_diff is known at compile time, and has only non-zero entries on linear
    // dimensions, then all calculation can be done at compile time.
    // TODO: this function is not compiled to the expected ISA
    __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
    {
        // For transformation of multi-index difference, not all transformation functions need
        // to know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or
        // not.
        const auto idx_low_diff =
            tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);

        return coord_low_.CalculateOffsetDiff(idx_low_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_up_.IsUpperIndexValid(idx_up_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    // most evaluation is done at compile-time
    __host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
    {
        return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
               coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    private:
    const UpperDesc tensor_desc_up_;

    // idx_up_ may be calculated and updated, however, the value of some (or all) of its
    // entries may never be used. Compiler should be able to remove these entries as well as
    // their calculation as dead code.
    // TODO: make sure compiler indeed removes this dead code
    UpperIndex idx_up_;

    LowerCoord coord_low_;
};

template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicNativeTensorDescriptor_v1<NDim>& tensor_desc,
                                  const MultiIndex<NDim>& idx)
{
    return DynamicNativeTensorCoordinate_v1<DynamicNativeTensorDescriptor_v1<NDim>>{tensor_desc,
                                                                                    idx};
}

template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate_v1(const DynamicTransformedTensorDescriptor_v1<Ts...>& tensor_desc,
                                  const MultiIndex<NDim>& idx)
{
    static_assert(DynamicTransformedTensorDescriptor_v1<Ts...>::GetNumOfDimension() == NDim,
                  "wrong! inconsistent # of dimensions");

    return DynamicTransformedTensorCoordinate_v1<DynamicTransformedTensorDescriptor_v1<Ts...>>{
        tensor_desc, idx};
}

template <typename TensorDesc>
struct DynamicTensorCoordinate_v1
{
    static constexpr index_t NDim = TensorDesc::GetNumOfDimension();

    using type =
        decltype(make_dynamic_tensor_coordinate_v1<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
};

} // namespace ck
#endif
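The point of caching offset_ in DynamicNativeTensorCoordinate_v1 is that operator+= only pays for the step, not for a full re-flattening of the index: offset is updated by dot(idx_diff, strides). A standalone sketch of that incremental-offset invariant (plain C++, hypothetical names):

#include <array>
#include <cassert>

// For a strided (native) tensor, offset = dot(idx, strides), so a move by
// idx_diff can be accounted for with offset += dot(idx_diff, strides)
// instead of recomputing the whole dot product.
constexpr int NDim = 4;
using Index = std::array<int, NDim>;

int dot(const Index& a, const Index& b)
{
    int s = 0;
    for(int i = 0; i < NDim; ++i)
        s += a[i] * b[i];
    return s;
}

int main()
{
    const Index strides{128 * 17 * 17, 17 * 17, 17, 1}; // a packed 4-D layout
    Index idx{0, 3, 5, 7};
    int offset = dot(idx, strides);

    const Index step{0, 0, 1, 2}; // move by (+1, +2) in the last two dimensions
    for(int i = 0; i < NDim; ++i)
        idx[i] += step[i];
    offset += dot(step, strides); // incremental update, as in operator+=

    assert(offset == dot(idx, strides)); // agrees with full recomputation
    return 0;
}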
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V1_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"

namespace ck {

template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto
make_dynamic_naive_tensor_descriptor_v1(const Lengths& lengths, const Strides& strides)
{
    static_assert(Lengths::Size() == Strides::Size(), "wrong! Size not the same");

    return DynamicNativeTensorDescriptor_v1<Lengths::Size()>(lengths, strides);
}

template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensionIds,
          typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor_v1(const LowTensorDescriptor& low_tensor_desc,
                                       const Transforms& transforms,
                                       LowDimensionIds,
                                       UpDimensionIds)
{
    return DynamicTransformedTensorDescriptor_v1<LowTensorDescriptor,
                                                 Transforms,
                                                 LowDimensionIds,
                                                 UpDimensionIds>{low_tensor_desc, transforms};
}

} // namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v1.hpp
deleted 100644 → 0 · view file @ c9b52bce
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_V1_HPP

#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"

namespace ck {

template <index_t NDim>
struct DynamicNativeTensorDescriptor_v1
{
    using Index = MultiIndex<NDim>;

    const Index lengths_;
    const Index strides_;

    __host__ __device__ constexpr DynamicNativeTensorDescriptor_v1(const Index& lengths,
                                                                   const Index& strides)
        : lengths_{lengths}, strides_{strides}
    {
    }

    __host__ __device__ constexpr DynamicNativeTensorDescriptor_v1()
        : lengths_{make_zero_multi_index<NDim>()}, strides_{make_zero_multi_index<NDim>()}
    {
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }

    __host__ __device__ constexpr auto GetLengths() const { return lengths_; }

    __host__ __device__ constexpr auto GetStrides() const { return strides_; }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
    {
        return lengths_[Number<IDim>{}];
    }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetStride(Number<IDim>) const
    {
        return strides_[Number<IDim>{}];
    }

    __host__ __device__ constexpr index_t GetElementSize() const
    {
        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
    }

    __host__ __device__ constexpr index_t GetElementSpace() const
    {
        index_t space = 1;

        static_for<0, NDim, 1>{}([&](auto i) { space += (GetLength(i) - 1) * GetStride(i); });

        return space;
    }

    template <typename Idx>
    __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
    {
        index_t offset = 0;

        static_for<0, NDim, 1>{}([&](auto i) { offset += idx[i] * GetStride(i); });

        return offset;
    }

    template <typename IdxDiff>
    __host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
    {
        return CalculateOffset(idx_diff);
    }

    template <typename Idx>
    __host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
    {
        bool flag = true;

        static_for<0, NDim, 1>{}(
            [&](auto i) { flag = flag && idx[i] >= 0 && idx[i] < GetLength(i); });

        return flag;
    }
};

template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor_v1 or
                                        // DynamicTransformedTensorDescriptor_v1
          typename Transforms,          // Tuple<MultIndexTransforms...>
          typename LowDimensionIds,     // Tuple<Sequence<...>>
          typename UpDimensionIds>      // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor_v1
{
    using LowerDesc = LowTensorDescriptor;
    using UpperDesc = DynamicTransformedTensorDescriptor_v1;

    static constexpr index_t NTransform = Transforms::Size();

    const LowerDesc low_tensor_desc_;
    const Transforms transforms_;

    __host__ __device__ static constexpr index_t GetNumOfLowerDimension()
    {
        return LowerDesc::GetNumOfDimension();
    }

    __host__ __device__ static constexpr index_t GetNumOfUpperDimension()
    {
        index_t ndim_up = 0;

        static_for<0, NTransform, 1>{}([&](auto i) constexpr {
            constexpr auto tmp = UpDimensionIds{}.At(i);
            ndim_up += decltype(tmp)::Size();
        });

        return ndim_up;
    }

    static constexpr index_t NDimUp  = GetNumOfUpperDimension();
    static constexpr index_t NDimLow = GetNumOfLowerDimension();

    using UpperIndex = MultiIndex<NDimUp>;
    using LowerIndex = MultiIndex<NDimLow>;

    struct lambda_merge_sequences
    {
        template <typename... Xs>
        __host__ __device__ constexpr auto operator()(Xs... xs) const
        {
            return merge_sequences(xs...);
        }
    };

    struct lambda_merge_arrays
    {
        template <typename... Xs>
        __host__ __device__ constexpr auto operator()(Xs... xs) const
        {
            return container_cat(xs...);
        }
    };

    __host__ __device__ constexpr DynamicTransformedTensorDescriptor_v1(
        const LowerDesc& low_tensor_desc, const Transforms& transforms)
        : low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
    {
        static_assert(NTransform == Transforms::Size() &&
                          NTransform == LowDimensionIds::Size() &&
                          NTransform == UpDimensionIds::Size(),
                      "wrong! # of transformations not the same");

        // sanity check:
        //   LowDimensionIds should include all low-dimensions,
        //   UpDimensionIds should include all up-dimensions
        using unsorted_up_dimension_ids =
            decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));

        using sorted_up_dimension_ids =
            typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;

        static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
                          is_valid_sequence_map<sorted_up_dimension_ids>{},
                      "wrong! UpDimensionIds is not configured correctly");

        using unsorted_low_dimension_ids =
            decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));

        using sorted_low_dimension_ids =
            typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;

        static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
                          is_valid_sequence_map<sorted_low_dimension_ids>{},
                      "wrong! LowDimensionIds is not configured correctly");

        // TODO: sanity check: while an up-dimension could be associated with multiple
        //   transformations, a low-dimension should be associated with only one transformation
        // TODO: sanity check: GetLowerLengths of each transform should be consistent with the
        //   lengths of the lower tensor descriptor
    }

    __host__ __device__ constexpr DynamicTransformedTensorDescriptor_v1()
        : low_tensor_desc_{}, transforms_{}
    {
    }

    __host__ __device__ static constexpr index_t GetNumOfDimension()
    {
        return GetNumOfUpperDimension();
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        // sort upper-dimension-ids
        constexpr auto unsorted_up_dimension_ids =
            unpack(lambda_merge_sequences{}, UpDimensionIds{});

        using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
                                                           math::less<index_t>,
                                                           math::equal<index_t>>;

        constexpr auto sorted2unsorted_map =
            typename sort_up_dimension_ids::sorted2unsorted_map{};

        // sort upper-lengths
        const auto tuple_of_up_lengths = transform_tuples(
            [](const auto& tran) constexpr { return tran.GetUpperLengths(); }, transforms_);

        const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);

        const auto sorted_up_lengths =
            container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);

        return sorted_up_lengths;
    }

    __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }

    template <index_t IDim>
    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
    {
        return GetLengths()[Number<IDim>{}];
    }

    __host__ __device__ constexpr index_t GetElementSize() const
    {
        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
    }

    __host__ __device__ constexpr index_t GetElementSpace() const
    {
        return low_tensor_desc_.GetElementSpace();
    }

    __host__ __device__ constexpr auto GetLowerTensorDescriptor() const
    {
        return low_tensor_desc_;
    }

    template <typename LowIdx, typename UpIdx>
    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
    {
        static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
            const auto tran = transforms_.At(itran);

            const auto idx_up_part = pick_container_element(idx_up, UpDimensionIds{}.At(itran));
            auto idx_low_part = pick_container_element(idx_low, LowDimensionIds{}.At(itran));

            tran.CalculateLowerIndex(idx_low_part, idx_up_part);
        });
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
                                                     const UpIdxDiff& idx_up_diff,
                                                     const LowIdx& idx_low_old,
                                                     const UpIdx& idx_up_old) const
    {
        static_for<0, NTransform, 1>{}([&](auto itran) {
            const auto tran = transforms_.At(itran);

            const auto idx_up_diff_part =
                pick_container_element(idx_up_diff, UpDimensionIds{}.At(itran));

            const auto idx_up_old_part =
                pick_container_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
                pick_container_element(idx_low_old, LowDimensionIds{}.At(itran));

            auto idx_low_diff_part =
                pick_container_element(idx_low_diff, LowDimensionIds{}.At(itran));

            tran.CalculateLowerIndexDiff(
                idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
        });
    }

    template <typename UpIdx>
    __host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
    {
        LowerIndex idx_low;

        CalculateLowerIndex(idx_low, idx_up);

        return idx_low;
    }

    template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
                                                               const LowIdx& idx_low_old,
                                                               const UpIdx& idx_up_old) const
    {
        LowerIndex idx_low_diff;

        CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);

        return idx_low_diff;
    }

    __host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
    {
        return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
    }

    __host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
    {
        bool flag = true;

        static_for<0, NDimUp, 1>{}(
            [&](auto i) { flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i); });

        return flag;
    }

    __host__ __device__ constexpr bool
    IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
    {
        bool flag = true;

        static_for<0, NTransform, 1>{}([&](auto itran) {
            const auto tran = Transforms{}.At(itran);

            // only check a transformation if it does not always map a valid upper index to a
            // valid lower index
            constexpr bool is_valid_up_always_mapped_to_valid_low =
                decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();

            if constexpr(!is_valid_up_always_mapped_to_valid_low)
            {
                const auto up_dims_part = UpDimensionIds{}.At(itran);
                const auto idx_up_part  = pick_container_element(idx_up, up_dims_part);

                flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
            }
        });

        return flag;
    }
};

} // namespace ck
#endif
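GetElementSpace() above computes the buffer extent as one plus the offset of the last addressable element, i.e. 1 + sum_i (len_i - 1) * stride_i. A standalone check of that formula (plain C++; the packed strides are an assumed example):

#include <array>
#include <cassert>

// For a strided tensor, the last element lives at offset
// sum_i (len_i - 1) * stride_i, so the required buffer size is that plus one.
int main()
{
    const std::array<int, 4> len{2, 3, 4, 5};
    const std::array<int, 4> stride{60, 20, 5, 1}; // packed layout

    int space = 1;
    for(int i = 0; i < 4; ++i)
        space += (len[i] - 1) * stride[i];

    assert(space == 2 * 3 * 4 * 5); // packed case: space equals the element count
    return 0;
}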
driver/CMakeLists.txt
view file @ 1c62b47b
@@ -17,17 +17,13 @@ install(TARGETS host LIBRARY DESTINATION lib)
 if(DEVICE_BACKEND STREQUAL "AMD")
     set(CONV_SOURCE src/conv_driver.cpp)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cpp)
-    set(COL2IM_SOURCE src/col2im_driver.cpp)
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     set(CONV_SOURCE src/conv_driver.cu)
     set(CONV_BWD_DATA_SOURCE src/conv_bwd_data_driver.cu)
-    set(COL2IM_SOURCE src/col2im_driver.cu)
 endif()

 add_executable(conv_driver ${CONV_SOURCE})
 add_executable(conv_bwd_data_driver ${CONV_BWD_DATA_SOURCE})
-add_executable(col2im_driver ${COL2IM_SOURCE})

 target_link_libraries(conv_driver PRIVATE host)
 target_link_libraries(conv_bwd_data_driver PRIVATE host)
-target_link_libraries(col2im_driver PRIVATE host)
driver/include/device_col2im_eb_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"

template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
void device_col2im_eb_nchw(ColDesc,
                           const Tensor<T>& col_eb,
                           ImgDesc,
                           Tensor<T>& img_nchw,
                           FilterSizes,
                           OutputSizes,
                           ConvStrides,
                           ConvDilations,
                           LeftPads,
                           RightPads,
                           std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto col_eb_desc   = ColDesc{};
    constexpr auto img_nchw_desc = ImgDesc{};

    constexpr index_t N  = img_nchw_desc.GetLengths()[0];
    constexpr index_t C  = img_nchw_desc.GetLengths()[1];
    constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
    constexpr index_t Wi = img_nchw_desc.GetLengths()[3];

    constexpr index_t E = col_eb_desc.GetLengths()[0];
    constexpr index_t B = col_eb_desc.GetLengths()[1];

    std::size_t data_sz = sizeof(T);
    DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
    DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());

    col_eb_device_buf.ToDevice(col_eb.mData.data());
    img_nchw_device_buf.ToDevice(img_nchw.mData.data());

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t EPerBlock = 128;
    constexpr index_t BPerBlock = 128;

    using BlockCopySubLengths_E_B            = Sequence<8, 8>;
    using BlockCopyClusterLengths_E_B        = Sequence<16, 16>;
    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif

    constexpr index_t GridSize =
        ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_col2im =
        GridwiseCol2Im_eb_nchw<GridSize,
                               BlockSize,
                               T,
                               ColDesc,
                               ImgDesc,
                               FilterSizes,
                               OutputSizes,
                               ConvStrides,
                               ConvDilations,
                               LeftPads,
                               RightPads,
                               EPerBlock,
                               BPerBlock,
                               BlockCopySubLengths_E_B,
                               BlockCopyClusterLengths_E_B,
                               BlockCopyThreadClusterArrangeOrder,
                               BlockCopySrcAccessOrder,
                               BlockCopyDstAccessOrder,
                               BlockCopyDataPerAccess_B>{};

    for(index_t i = 0; i < 1; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                                 const T* const __restrict__,
                                                 T* const __restrict__>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          const_cast<const T* const __restrict__>(
                              static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
                          const_cast<T* const __restrict__>(
                              static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
        }

        timer.End();

        float ave_time = timer.GetElapsedTime() / nrepeat;

        std::cout << "Average time : " << ave_time << " ms" << std::endl;
    }

    img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
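The GridSize expression above launches one workgroup per EPerBlock x BPerBlock tile, with ceiling division on each axis (the kernel's own static_assert then requires the division to be exact). A small sketch with numbers from the default col2im_driver.cpp config (assumed values: C = 128, Y = X = 3, N = 128, Ho = Wo = 9 for a 17x17 input with 3x3 filter, stride 2, pad 1):

#include <cassert>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main()
{
    constexpr int E = 128 * 3 * 3;   // C * Y * X = 1152
    constexpr int B = 128 * 9 * 9;   // N * Ho * Wo = 10368
    constexpr int EPerBlock = 128, BPerBlock = 128;

    constexpr int GridSize = ceil_div(E, EPerBlock) * ceil_div(B, BPerBlock);
    static_assert(GridSize == 9 * 81, "one workgroup per (E, B) tile");
    return 0;
}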
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_dynamic_col2im_gemmkgemmn_nchw.hpp"

template <typename T,
          typename ColDesc,
          typename ImgDesc,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads>
void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
                                           const Tensor<T>& col_gemmk_gemmn,
                                           ImgDesc,
                                           Tensor<T>& img_n_c_hi_wi,
                                           FilterSizes,
                                           OutputSizes,
                                           ConvStrides,
                                           ConvDilations,
                                           InLeftPads,
                                           InRightPads,
                                           std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    std::size_t data_sz = sizeof(T);
    DeviceMem col_gemmk_gemmn_device_buf(data_sz * col_gemmk_gemmn.mDesc.GetElementSpace());
    DeviceMem img_n_c_hi_wi_device_buf(data_sz * img_n_c_hi_wi.mDesc.GetElementSpace());

    col_gemmk_gemmn_device_buf.ToDevice(col_gemmk_gemmn.mData.data());
    img_n_c_hi_wi_device_buf.ToDevice(img_n_c_hi_wi.mData.data());

    const auto col_gemmk_gemmn_desc = make_dynamic_naive_tensor_descriptor<2>(
        to_multi_index(ColDesc::GetLengths()), to_multi_index(ColDesc::GetStrides()));

    const auto img_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
        to_multi_index(ImgDesc::GetLengths()), to_multi_index(ImgDesc::GetStrides()));

    const auto filter_sizes   = to_multi_index(FilterSizes{});
    const auto out_sizes      = to_multi_index(OutputSizes{});
    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
    const auto in_left_pads   = to_multi_index(InLeftPads{});
    const auto in_right_pads  = to_multi_index(InRightPads{});

    const auto img_gemmk_gemmn_desc = map_img_into_col(img_n_c_hi_wi_desc,
                                                       out_sizes,
                                                       filter_sizes,
                                                       conv_strides,
                                                       conv_dilations,
                                                       in_left_pads,
                                                       in_right_pads);

    const index_t GemmN = col_gemmk_gemmn_desc.GetLength(I1);

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmNPerBlock = 128;

    using BlockCopySubLengths_GemmK_GemmN     = Sequence<1, 8>;
    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;
    using BlockCopyThreadClusterArrangeOrder  = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopySrcAccessOrder             = Sequence<0, 1>; // [GemmK, GemmN]
    using BlockCopyDstAccessOrder             = Sequence<0, 1>; // [GemmK, GemmN]

    constexpr index_t BlockCopyDataPerAccess_GemmN = 1;
#endif

    const index_t GridSize = GemmN / GemmNPerBlock;

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_col2im =
        GridwiseDynamicCol2Im_gemmkgemmn_nchw<BlockSize,
                                              GemmKPerBlock,
                                              GemmNPerBlock,
                                              BlockCopySubLengths_GemmK_GemmN,
                                              BlockCopyClusterLengths_GemmK_GemmN,
                                              BlockCopyThreadClusterArrangeOrder,
                                              BlockCopySrcAccessOrder,
                                              BlockCopyDstAccessOrder,
                                              BlockCopyDataPerAccess_GemmN>{};

    for(index_t i = 0; i < 1; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
                                                 const T* const __restrict__,
                                                 T* const __restrict__,
                                                 decltype(col_gemmk_gemmn_desc),
                                                 decltype(img_gemmk_gemmn_desc)>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          const_cast<const T* const __restrict__>(
                              static_cast<T*>(col_gemmk_gemmn_device_buf.GetDeviceBuffer())),
                          const_cast<T* const __restrict__>(
                              static_cast<T*>(img_n_c_hi_wi_device_buf.GetDeviceBuffer())),
                          col_gemmk_gemmn_desc,
                          img_gemmk_gemmn_desc);
        }

        timer.End();

        float ave_time = timer.GetElapsedTime() / nrepeat;

        std::cout << "Average time : " << ave_time << " ms" << std::endl;
    }

    img_n_c_hi_wi_device_buf.FromDevice(img_n_c_hi_wi.mData.data());
}
driver/include/host_col2im.hpp
deleted 100644 → 0 · view file @ c9b52bce
#pragma once
#include <thread>
#include "host_tensor.hpp"

template <typename T,
          typename FilterSizes,
          typename OutputSizes,
          typename ConvStrides,
          typename ConvDilations,
          typename LeftPads,
          typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
                 Tensor<T>& in_nchw,
                 FilterSizes,
                 OutputSizes,
                 ConvStrides,
                 ConvDilations,
                 LeftPads,
                 RightPads)
{
    using namespace ck;

    int N  = in_nchw.mDesc.GetLengths()[0];
    int C  = in_nchw.mDesc.GetLengths()[1];
    int Hi = in_nchw.mDesc.GetLengths()[2];
    int Wi = in_nchw.mDesc.GetLengths()[3];

    int Y = FilterSizes{}[0];
    int X = FilterSizes{}[1];

    int Ho = OutputSizes{}[0];
    int Wo = OutputSizes{}[1];

    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

            if(h_tmp % ConvStrides{}[0] == 0)
            {
                int ho = h_tmp / ConvStrides{}[0];

                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

                        if(w_tmp % ConvStrides{}[1] == 0)
                        {
                            int wo = w_tmp / ConvStrides{}[1];

                            if(wo >= 0 && wo < Wo)
                            {
                                int e = c * (Y * X) + y * X + x;
                                int b = n * (Ho * Wo) + ho * Wo + wo;

                                v += in_eb(e, b);
                            }
                        }
                    }
                }
            }
        }

        in_nchw(n, c, hi, wi) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            in_nchw.mDesc.GetLengths()[0],
                                            in_nchw.mDesc.GetLengths()[1],
                                            in_nchw.mDesc.GetLengths()[2],
                                            in_nchw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}
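The loop structure above inverts the forward mapping hi = ho * stride_h + y * dil_h - pad_h: for a fixed input row it enumerates the filter taps whose strided output row lands on it. A standalone worked example (plain C++; values taken from the 17x17 / 3x3 / stride-2 / pad-1 config in col2im_driver.cpp):

#include <cstdio>

int main()
{
    const int Y = 3, Ho = 9, pad_h = 1, stride_h = 2, dil_h = 1;
    const int hi = 6; // a fixed input row

    for(int y = 0; y < Y; ++y)
    {
        const int h_tmp = hi + pad_h - y * dil_h;

        if(h_tmp >= 0 && h_tmp % stride_h == 0)
        {
            const int ho = h_tmp / stride_h;

            if(ho >= 0 && ho < Ho)
                std::printf("y=%d contributes via ho=%d\n", y, ho); // prints: y=1 via ho=3
        }
    }
    return 0;
}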
driver/src/col2im_driver.cpp
deleted 100644 → 0 · view file @ c9b52bce
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "host_col2im.hpp"
#include "device_col2im_eb_nchw.hpp"
#include "device_dynamic_col2im_gemmkgemmn_nchw.hpp"

int main(int argc, char* argv[])
{
    using namespace ck;

#if 1
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128, C = 192, HI = 71, WI = 71, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 1
    // 1x1, 8x8
    constexpr index_t N = 128, C = 1536, HI = 8, WI = 8, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 73x73
    constexpr index_t N = 128, C = 160, HI = 73, WI = 73, K = 64, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 35x35
    constexpr index_t N = 128, C = 96, HI = 35, WI = 35, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128, C = 192, HI = 71, WI = 71, K = 192, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 7x1, 17x17
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 7, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<3, 0>;
    using RightPads     = Sequence<3, 0>;
#elif 1
    // 1x7, 17x17
    constexpr index_t N = 128, C = 128, HI = 17, WI = 17, K = 128, Y = 1, X = 7;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 3>;
    using RightPads     = Sequence<0, 3>;
#elif 0
    // 3x3, 299x299 stride=2
    constexpr index_t N = 128, C = 3, HI = 299, WI = 299, K = 32, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 147x147
    constexpr index_t N = 128, C = 32, HI = 147, WI = 147, K = 64, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 149x149
    constexpr index_t N = 128, C = 32, HI = 149, WI = 149, K = 32, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 17x17, stride 2
    constexpr index_t N = 128, C = 192, HI = 17, WI = 17, K = 192, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 35x35
    constexpr index_t N = 128, C = 384, HI = 35, WI = 35, K = 96, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128, C = 256, HI = 35, WI = 35, K = 384, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x3, 8x8
    constexpr index_t N = 128, C = 384, HI = 8, WI = 8, K = 448, Y = 1, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 1>;
    using RightPads     = Sequence<0, 1>;
#elif 0
    // 3x1, 8x8
    constexpr index_t N = 128, C = 448, HI = 8, WI = 8, K = 512, Y = 3, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 0>;
    using RightPads     = Sequence<1, 0>;
#elif 0
    // 3x3, 147x147
    constexpr index_t N = 128, C = 64, HI = 147, WI = 147, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 7x1, 73x73
    constexpr index_t N = 128, C = 64, HI = 73, WI = 73, K = 64, Y = 7, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<3, 0>;
    using RightPads     = Sequence<3, 0>;
#elif 0
    // 3x3, 73x73
    constexpr index_t N = 128, C = 64, HI = 73, WI = 73, K = 96, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14, stride 2
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 2048, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 14x14, stride 2
    constexpr index_t N = 128, C = 1024, HI = 14, WI = 14, K = 512, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 1
    // 3x3, 28x28
    constexpr index_t N = 128, C = 192, HI = 28, WI = 28, K = 128, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 3x3, 14x14
    constexpr index_t N = 128, C = 256, HI = 14, WI = 14, K = 256, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 1x1, 56x56, stride 2
    constexpr index_t N = 128, C = 256, HI = 56, WI = 56, K = 128, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 7x7, 230x230 stride=2
    constexpr index_t N = 128, C = 3, HI = 230, WI = 230, K = 64, Y = 7, X = 7;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 28x28, stride = 2
    constexpr index_t N = 128, C = 512, HI = 28, WI = 28, K = 1024, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 28x28, stride 2
    constexpr index_t N = 128, C = 512, HI = 28, WI = 28, K = 256, Y = 1, X = 1;
    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 1x1, 7x7
    constexpr index_t N = 128, C = 512, HI = 7, WI = 7, K = 2048, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 7x7
    constexpr index_t N = 128, C = 512, HI = 7, WI = 7, K = 512, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#elif 0
    // 1x1, 56x56
    constexpr index_t N = 128, C = 64, HI = 56, WI = 56, K = 64, Y = 1, X = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<0, 0>;
    using RightPads     = Sequence<0, 0>;
#elif 0
    // 3x3, 56x56
    constexpr index_t N = 128, C = 64, HI = 56, WI = 56, K = 64, Y = 3, X = 3;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
    using LeftPads      = Sequence<1, 1>;
    using RightPads     = Sequence<1, 1>;
#endif

    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
    constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
    constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

    constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
    constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

    constexpr auto col_eb_desc =
        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

    using FilterSizes = Sequence<Y, X>;
    using OutputSizes = Sequence<HO, WO>;

    ostream_tensor_descriptor(col_eb_desc, std::cout << "col_eb_desc: ");
    ostream_tensor_descriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
    print_array("FilterSizes", FilterSizes{});
    print_array("OutputSizes", OutputSizes{});
    print_array("LeftPads", LeftPads{});
    print_array("RightPads", RightPads{});
    print_array("ConvStrides", ConvStrides{});
    print_array("ConvDilations", ConvDilations{});

    Tensor<float> col_eb(make_HostTensorDescriptor(col_eb_desc));
    Tensor<float> img_nchw_host(make_HostTensorDescriptor(img_nchw_desc));
    Tensor<float> img_nchw_device(make_HostTensorDescriptor(img_nchw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(argc != 3)
    {
        printf("arg1: do_verification, arg2: nrepeat\n");
        exit(1);
    }

    bool do_verification = atoi(argv[1]);
    std::size_t nrepeat  = atoi(argv[2]);

    if(do_verification)
    {
#if 0
        col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#else
        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#endif
    }

#if 0
    device_col2im_eb_nchw(col_eb_desc,
                          col_eb,
                          img_nchw_desc,
                          img_nchw_device,
                          FilterSizes{},
                          OutputSizes{},
                          ConvStrides{},
                          ConvDilations{},
                          LeftPads{},
                          RightPads{},
                          nrepeat);
#elif 1
    device_dynamic_col2im_gemmkgemmn_nchw(col_eb_desc,
                                          col_eb,
                                          img_nchw_desc,
                                          img_nchw_device,
                                          FilterSizes{},
                                          OutputSizes{},
                                          ConvStrides{},
                                          ConvDilations{},
                                          LeftPads{},
                                          RightPads{},
                                          nrepeat);
#endif

    if(do_verification)
    {
        host_col2im(col_eb,
                    img_nchw_host,
                    FilterSizes{},
                    OutputSizes{},
                    ConvStrides{},
                    ConvDilations{},
                    LeftPads{},
                    RightPads{});

        check_error(img_nchw_host, img_nchw_device);

#if 0
        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
#endif
    }
}
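As the argc check above indicates, the driver took two positional arguments, e.g. col2im_driver 1 10 to verify the device result against host_col2im and average the kernel time over 10 runs (the binary name assumes the add_executable(col2im_driver ...) rule that this commit removes from driver/CMakeLists.txt).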