Commit 1e3d69b9 authored by Chao Liu

small test case for hip compiler

parent f0716f5b
#ifndef CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#define CK_THREADWISE_DIRECT_CONVOLUTION_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "threadwise_tensor_slice_copy.hpp"
namespace ck {
// optimized for the scenario where p_in, p_wei and p_out are all in registers
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_1(InDesc,
TInWei* const __restrict__ p_in,
WeiDesc,
TInWei* const __restrict__ p_wei,
OutDesc,
TOut* __restrict__ p_out)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
#if 0
if(blockIdx.x == 0 && get_thread_local_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_desc, "threadwise_direct_convolution: in_desc: ");
print_ConstantTensorDescriptor(wei_desc, "threadwise_direct_convolution: wei_desc: ");
print_ConstantTensorDescriptor(out_desc, "threadwise_direct_convolution: out_desc: ");
}
#endif
for(index_t n = 0; n < out_desc.GetLength(I0); ++n)
{
for(index_t k = 0; k < out_desc.GetLength(I1); ++k)
{
for(index_t ho = 0; ho < out_desc.GetLength(I2); ++ho)
{
for(index_t wo = 0; wo < out_desc.GetLength(I3); ++wo)
{
for(index_t c = 0; c < wei_desc.GetLength(I1); ++c)
{
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
const index_t hi = ho + y;
const index_t wi = wo + x;
const index_t in_index =
in_desc.GetOffsetFromMultiIndex(n, c, hi, wi);
const index_t wei_index =
wei_desc.GetOffsetFromMultiIndex(k, c, y, x);
const index_t out_index =
out_desc.GetOffsetFromMultiIndex(n, k, ho, wo);
fused_multiply_accumulate(
p_out[out_index], p_wei[wei_index], p_in[in_index]);
}
}
}
}
}
}
}
}
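// Worked example (illustrative numbers, not from this commit): with out_desc
// lengths (N, K, Ho, Wo) = (1, 1, 2, 2) and wei_desc lengths (K, C, Y, X) =
// (1, 1, 3, 3), the seven nested loops above perform 1*1*2*2*1*3*3 = 36
// multiply-accumulates, reading the input at hi = ho + y, wi = wo + x,
// i.e. a stride-1 convolution with no padding.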
// optimized for the scenario where p_in and p_wei are in LDS and p_out is in registers
// copy in and wei into registers before doing the convolution
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_2(InDesc,
TInWei* const __restrict__ p_in,
WeiDesc,
TInWei* const __restrict__ p_wei,
OutDesc,
TOut* __restrict__ p_out)
{
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr auto in_reg_desc = make_ConstantTensorDescriptor_packed(in_desc.GetLengths());
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor_packed(wei_desc.GetLengths());
// register
TInWei p_in_reg[in_reg_desc.GetElementSpace()];
TInWei p_wei_reg[wei_reg_desc.GetElementSpace()];
// copy input tensor into register
threadwise_tensor_slice_copy(
in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths(), Number<1>{});
// copy weight tensor into register
threadwise_tensor_slice_copy(
wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths(), Number<1>{});
// do convolution
threadwise_direct_convolution_1(
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
}
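// Note on the staging above: both operands are first copied from their source
// layout into packed register buffers (stride 1 in the last dimension), so the
// inner loops of threadwise_direct_convolution_1 index dense arrays instead of
// strided LDS, trading register pressure for fewer LDS reads.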
// optimized for the scenario where p_in and p_wei are in LDS and p_out is in registers
// break a non-1x1 convolution down into a sequence of 1x1 convolutions:
// load each 1x1 weight into registers, and do the 1x1 convolution in registers.
template <class Data, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_3(InDesc,
Data* const __restrict__ p_in,
WeiDesc,
Data* const __restrict__ p_wei,
OutDesc,
Data* __restrict__ p_out)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
in_desc.GetLength(I1),
out_desc.GetLength(I2),
out_desc.GetLength(I3)>{});
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
Data p_in_reg[in_reg_desc.GetElementSpace()];
Data p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr index_t in_w_new_read = 1;
constexpr auto in_desc_reg_new_read =
make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
in_reg_desc.GetLength(I1),
in_reg_desc.GetLength(I2),
in_w_new_read>{});
#if 0
// this version reuses old input data already in registers, and reads only new data from LDS
// loop over vertical direction
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// read first input
threadwise_4d_tensor_copy(in_desc,
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
in_reg_desc,
p_in_reg,
in_reg_desc.GetLengths());
// read first 1x1 weight
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, 0),
wei_reg_desc,
p_wei_reg,
wei_reg_desc.GetLengths());
// do first 1x1 conv
threadwise_direct_convolution_1(
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
// loop over horizontal direction
for(index_t x = 1; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
wei_reg_desc,
p_wei_reg,
wei_reg_desc.GetLengths());
// shift old input to the left
threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
// read new input
threadwise_4d_tensor_copy(
in_desc,
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
in_reg_desc,
p_in_reg +
in_reg_desc.GetOffsetFromMultiIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
in_desc_reg_new_read.GetLengths());
// do 1x1 conv
threadwise_direct_convolution_1(
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
}
}
#elif 1
// this version reads all input from LDS every time the filter window moves
// loop over vertical direction
for(index_t y = 0; y < wei_desc.GetLength(I2); ++y)
{
// loop over horizontal direction
for(index_t x = 0; x < wei_desc.GetLength(I3); ++x)
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x),
wei_reg_desc,
p_wei_reg,
wei_reg_desc.GetLengths());
// read new input
threadwise_4d_tensor_copy(in_desc,
p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x),
in_reg_desc,
p_in_reg,
in_reg_desc.GetLengths());
// do 1x1 conv
threadwise_direct_convolution_1(
in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
}
}
#endif
}
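// Sketch of the decomposition used above (shapes assumed for illustration):
// a Y x X filter is applied as Y*X shifted 1x1 convolutions. For a 3x3 filter
// this means 9 passes; pass (y, x) multiplies the 1x1 weight slice at (y, x)
// with the input window shifted by (y, x) and accumulates into the same output
// registers, so after all 9 passes p_out holds the full 3x3 result.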
} // namespace ck
#endif
#ifndef CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#define CK_THREADWISE_GENERIC_TENSOR_OP_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
namespace ck {
template <class Float, class TDesc>
__device__ void threadwise_generic_tensor_set_zero(TDesc, Float* __restrict__ p)
{
static_ford<decltype(TDesc::GetLengths())>{}([&](auto multi_id) {
constexpr index_t offset = TDesc::GetOffsetFromMultiIndex(multi_id);
p[offset] = static_cast<Float>(0);
});
}
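// Usage sketch (hypothetical, device-side; not part of this commit):
//   constexpr auto desc = make_ConstantTensorDescriptor_packed(Sequence<2, 4>{});
//   float p[desc.GetElementSpace()];
//   threadwise_generic_tensor_set_zero(desc, p);
// Since static_ford and the offsets are all compile-time, this unrolls into
// plain stores of zero at constant offsets.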
} // namespace ck
#endif
#ifndef CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#define CK_THREADWISE_TENSOR_SLICE_COPY_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace ck {
// need to assume src and dst are aligned
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, index_t DataPerRead>
__device__ void threadwise_tensor_slice_copy(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
Float* __restrict__ p_dst,
SrcOpLengths,
Number<DataPerRead>)
{
using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
constexpr index_t nDim = SrcOpLengths::GetSize();
static_assert(SrcDesc{}.GetNumOfDimension() == nDim && DstDesc{}.GetNumOfDimension() == nDim,
"wrong! dimension not consistent");
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(src_desc, "src_desc");
print_ConstantTensorDescriptor(dst_desc, "dst_desc");
print_ConstantTensorDescriptor(ref_desc, "ref_desc");
}
#endif
static_assert(DataPerRead == 1 || (SrcDesc{}.GetStride(Number<nDim - 1>{}) == 1 &&
DstDesc{}.GetStride(Number<nDim - 1>{}) == 1),
"wrong! only support stride[nDim-1] == 1!\n");
static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
"wrong! only support DataPerRead == 1, 2 or 4!\n");
static_assert(
SrcDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0 &&
DstDesc{}.GetStride(Number<nDim - 2>{}) % DataPerRead == 0,
"wrong! src and dst stride[nDim-2] should be multiple of DataPerRead to keep alignment");
constexpr index_t L_Back = SrcOpLengths{}.Back();
static_assert(L_Back % DataPerRead == 0,
"wrong! lengths[nDim-1] should be evenly divided by DataPerRead");
constexpr index_t nRead = L_Back / DataPerRead;
static_ford<decltype(ref_desc.GetLengths().PopBack())>{}([=](auto Ids) {
static_for<0, nRead, 1>{}([&](auto IRead) {
constexpr auto multi_id = decltype(Ids){}.PushBack(Number<IRead * DataPerRead>{});
const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id);
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(multi_id);
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) =
*(reinterpret_cast<const vector_t*>(&p_src[src_index]));
});
});
}
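// Worked example (illustrative): for SrcOpLengths = Sequence<2, 8>, Float = float
// and DataPerRead = 4, vector_t is a 4-wide float type, nRead = 8 / 4 = 2, and
// the copy issues 2 rows x 2 vector loads/stores per row; with DataPerRead = 1
// it degenerates to 16 scalar copies.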
// access in order of src
template <class SrcData,
class DstData,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src>
__device__ void
threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc,
const SrcData* __restrict__ p_src,
DstDesc,
DstData* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src)
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
ford<SrcOpLengths>{}([&](auto src_multi_id) {
const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{});
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
p_dst[dst_index] = p_src[src_index];
});
}
// access in order of dst
template <class SrcData,
class DstData,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src>
__device__ void
threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc,
const SrcData* __restrict__ p_src,
DstDesc,
DstData* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src)
{
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
ford<decltype(dst_op_lengths)>{}([&](auto dst_multi_id) {
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
p_dst[dst_index] = p_src[src_index];
});
}
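// Example of the mapping convention (illustrative): MapDst2Src = Sequence<1, 0>
// on a 2-d slice means dst dimension 0 comes from src dimension 1, i.e. a
// transpose. v1 walks the slice in src order and scatters into dst; v2 walks
// it in dst order and gathers from src, which keeps dst writes contiguous.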
// access in order of dst
// manually pack data into vector before write
template <class Float,
class SrcDesc,
class DstDesc,
class SrcOpLengths,
class MapDst2Src,
index_t DstDataPerWrite>
__device__ void
threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
const Float* __restrict__ p_src,
DstDesc,
Float* __restrict__ p_dst,
SrcOpLengths,
MapDst2Src,
Number<DstDataPerWrite>)
{
using vector_t = typename vector_type<Float, DstDataPerWrite>::MemoryType;
constexpr index_t nDim = SrcOpLengths::GetSize();
static_assert(DstDataPerWrite == 1 || DstDesc{}.GetStride(Number<nDim - 1>{}) == 1,
"wrong! only support dst.stride[nDim-1] == 1, if DstDataPerWrite != 1");
static_assert(DstDataPerWrite == 1 || DstDataPerWrite == 2 || DstDataPerWrite == 4,
"wrong! only support DstDataPerWrite == 1, 2 or 4");
static_assert(
DstDesc{}.GetStride(Number<nDim - 2>{}) % DstDataPerWrite == 0,
"wrong! dst.stride[nDim-2] should be multiple of DstDataPerWrite to keep alignment");
constexpr auto src_desc = SrcDesc{};
constexpr auto dst_desc = DstDesc{};
constexpr auto dst_op_lengths = SrcOpLengths{}.ReorderGivenNew2Old(MapDst2Src{});
constexpr index_t L_Dst_Back = dst_op_lengths.Back();
static_assert(L_Dst_Back % DstDataPerWrite == 0,
"wrong! dst.lengths[nDim-1] should be evenly divided by DstDataPerWrite");
constexpr index_t nWrite = L_Dst_Back / DstDataPerWrite;
ford<decltype(dst_op_lengths.PopBack())>{}([&](auto ids) {
static_for<0, nWrite, 1>{}([&](auto IWrite) {
vector_t dst_vec_data;
// pack data
static_for<0, DstDataPerWrite, 1>{}([&](auto IDstData) {
const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite + IDstData);
const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{});
const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id);
vector_type<Float, DstDataPerWrite>::SetScalar(
dst_vec_data, p_src[src_index], IDstData);
});
// write data
const auto dst_multi_id = ids.PushBack(IWrite * DstDataPerWrite);
const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id);
*(reinterpret_cast<vector_t*>(&p_dst[dst_index])) = dst_vec_data;
});
});
}
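// Note on the version above: instead of DstDataPerWrite scalar stores, the
// inner static_for gathers DstDataPerWrite scalars (each from a reordered src
// offset) into one vector register, then issues a single aligned vector store;
// this is why dst.stride[nDim-1] == 1 is required when DstDataPerWrite != 1.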
} // namespace ck
#endif
#pragma once
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <thread>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in,
WeiDesc,
const Tensor<T>& wei,
OutDesc,
Tensor<T>& out,
index_t nrepeat)
{
std::size_t data_sz = sizeof(T);
DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
int num_thread = std::thread::hardware_concurrency();
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
out_device_buf.ToDevice(out.mData.data());
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{};
constexpr auto out_desc = OutDesc{};
#if 1
// 3x3, 34x34, 128 threads
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 1;
constexpr index_t WeiBlockCopyDataPerRead = 1;
constexpr index_t BlockSize = 128;
#endif
constexpr index_t GridSize =
(out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
(out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
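// Illustrative size check (example numbers, not from this commit): a 3x3
// filter on a 34x34 input gives a 32x32 output; with N = 16 and K = 64 the
// grid is (16/2) * (64/32) * (32/2) * (32/32) = 8 * 2 * 16 * 1 = 256 blocks
// of 128 threads. Note the plain division assumes each output length is
// divisible by its per-block tile.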
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
using gridwise_conv = GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw<GridSize,
BlockSize,
T,
InDesc,
WeiDesc,
OutDesc,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>;
float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_device_buf.FromDevice(out.mData.data());
}
#pragma once
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <thread>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
// reorder input
auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
};
make_ParallelTensorFunctor(f_reorder_nchw2chwn, N, C, Hi, Wi)(
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_chwn_device_buf.ToDevice(in_chwn.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 0
// for 3x3, 34x34, v1r1, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 4>;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 34x34, v1r2, Pascal, in-block-copy1
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 4;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 8;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<0, 0, 0, 0>; // not used
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 34x34, v1r3, Pascal
// for 3x3, 28x28, v1r3, Pascal
// for 3x3, 14x14, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 2, 4>;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 34x34, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 1;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<2, 2, 32, 1>;
constexpr index_t InBlockCopyDataPerRead_N = 1;
constexpr index_t WeiBlockCopyDataPerRead_K = 2;
constexpr index_t OutThreadCopyDataPerWrite_N = 1;
#elif 0
// for 3x3, 34x34, v1r1, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 4, 2, 8>;
constexpr index_t InBlockCopyDataPerRead_N = 2;
constexpr index_t WeiBlockCopyDataPerRead_K = 2;
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
#elif 0
// for 3x3, 56x56, v1r1, Pascal
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 56x56, v1r2, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 1;
constexpr index_t GemmDataPerReadB = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 4;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 28x28, v1r1, Pascal
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopy_ThreadPerDimC = 1;
constexpr index_t InBlockCopy_ThreadPerDimH = 4;
constexpr index_t InBlockCopy_ThreadPerDimW = 4;
constexpr index_t InBlockCopy_ThreadPerDimN = 8;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopyClusterLengths_CHWN = Sequence<4, 2, 4, 4>;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 1x1, 28x28, v1r1, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
constexpr index_t BlockSize = 128;
#elif 0
// for 1x1, 14x14, v1r1, Pascal
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t InBlockCopy_ThreadPerDimC = 8;
constexpr index_t InBlockCopy_ThreadPerDimH = 2;
constexpr index_t InBlockCopy_ThreadPerDimW = 2;
constexpr index_t InBlockCopy_ThreadPerDimN = 4;
constexpr index_t InBlockCopyDataPerRead_N = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
constexpr index_t BlockSize = 128;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
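// The (X + XPerBlock - 1) / XPerBlock form is integer round-up division, so
// partial tiles still get a block. E.g. (illustrative) Ho = 30, HoPerBlock = 4
// gives (30 + 4 - 1) / 4 = 8 blocks along Ho.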
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_chwn_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopyClusterLengths_CHWN,
InBlockCopyDataPerRead_N,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}
#pragma once
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <thread>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 1;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<1, 2, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 4, 2, 32>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 4;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 16
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 4;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 8;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 4, 8>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 1;
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 4
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 8;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<2, 8, 4, 4>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 1;
#elif 0
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 2
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<8, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 1;
#elif 1
// for 3x3, 28x28, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#else
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_nkhw_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#pragma once
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <thread>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t N = in_nchw_desc.GetLength(I0);
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
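// BGhostRead counts how far past a pixel the flattened-HW input reads reach:
// a Y x X window on a row of width Wi touches offsets up to (Y-1)*Wi + (X-1).
// E.g. (illustrative) Y = X = 3, Wi = 34 gives 2*34 + 2 = 70 extra elements,
// which is why the input device buffer below reserves BGhostRead + BPerBlock
// elements beyond the tensor itself.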
// convert in_nchw to in_chwn
auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
make_ParallelTensorFunctor(
[&](auto n, auto c, auto hi, auto wi) { in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi); },
N,
C,
Hi,
Wi)(std::thread::hardware_concurrency());
// convert wei_kcyx to wei_cyxk
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
make_ParallelTensorFunctor(
[&](auto k, auto c, auto y, auto x) { wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); },
K,
C,
Y,
X)(std::thread::hardware_concurrency());
// convert out_nkhw to out_khwn
auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
#if 0
// 3x3, 34x34
// need to use register double buffer for GEMM
constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 8;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t OutThreadCopyDataPerWrite = 4;
constexpr index_t BlockSize = 128;
#elif 0
// 1x1, 28x28, 64 threads
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 2;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 64;
#elif 0
// 1x1, 28x28, 128 threads, no lds-double-buffer
// 1x1, 28x28, 128 threads, with lds-double-buffer, max_register = 128
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 128;
#elif 0
// 1x1, 28x28, 256 threads
constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmThreadPerColumnPerCluster = 8;
constexpr index_t GemmThreadPerRowPerCluster = 8;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t BlockSize = 256;
#elif 0
// 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
constexpr index_t BPerBlock = 64;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t OutThreadCopyDataPerWrite = 4;
constexpr index_t BlockSize = 128;
#elif 1
// 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer
constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t BPerThread = 8;
constexpr index_t KPerThread = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
constexpr index_t InBlockCopyThreadPerDim0 = 4;
constexpr index_t InBlockCopyThreadPerDim1 = 16;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
constexpr index_t InBlockCopyDataPerRead = 4;
constexpr index_t WeiBlockCopyDataPerRead = 4;
constexpr index_t OutThreadCopyDataPerWrite = 4;
constexpr index_t BlockSize = 256;
#endif
constexpr index_t GridSize =
((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
// allocate device memory and copy tensors to device
std::size_t data_sz = sizeof(T);
DeviceMem in_chwn_device_buf(data_sz * (in_chwn.mDesc.GetElementSpace() + BGhostRead +
BPerBlock)); // reserve extra space for BGhostRead
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_chwn_device_buf.ToDevice(in_chwn.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#else
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_chwn_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
BPerBlock,
KPerBlock,
CPerBlock,
BPerThread,
KPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
OutThreadCopyDataPerWrite>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// convert out_khwn to out_nkhw
make_ParallelTensorFunctor(
[&](auto n, auto k, auto ho, auto wo) { out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n); },
N,
K,
Ho,
Wo)(std::thread::hardware_concurrency());
}
#pragma once
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <thread>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
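    // Equivalent sequential sketch of the KCYX -> CYXK reorder above, for
    // illustration only (the parallel functor is what actually runs):
#if 0
    for(index_t k = 0; k < K; ++k)
        for(index_t c = 0; c < C; ++c)
            for(index_t y = 0; y < Y; ++y)
                for(index_t x = 0; x < X; ++x)
                    wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
#endif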
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t N1 = 2;
constexpr index_t N2 = 4;
constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
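    // The merged N*Ho*Wo dimension is repartitioned as [N1, B, N2], which
    // implicitly assumes N1 * N2 divides N * Ho * Wo exactly; a sketch of the
    // implied constraint (not enforced by the original code):
#if 0
    static_assert((N * Ho * Wo) % (N1 * N2) == 0,
                  "v3 layout requires N*Ho*Wo to be a multiple of N1*N2");
#endif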
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_C_N1_B_N2 = Sequence<1, 1, 1, 4>;
using InBlockCopyClusterLengths_C_N1_B_N2 = Sequence<8, 2, 16, 1>;
constexpr index_t InBlockCopySrcDataPerRead_B = 1;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
using WeiBlockCopySubLengths_C_K = Sequence<1, 4>;
using WeiBlockCopyClusterLengths_C_K = Sequence<8, 32>;
constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
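    // How the copy parameters above fit together (worked check): the weight
    // copy uses 8 * 32 = 256 threads = BlockSize, and sub-lengths times
    // cluster lengths give [1*8, 4*32] = [8, 128] = [CPerBlock, KPerBlock];
    // likewise the input copy covers [8, 2, 16, 4] = [CPerBlock, N1,
    // BPerBlock, N2] with 8*2*16*1 = 256 threads.
#if 0
    static_assert(8 * 32 == BlockSize, "weight copy cluster must fill the block");
    static_assert(1 * 8 == CPerBlock && 4 * 32 == KPerBlock,
                  "sub * cluster lengths must cover the block tile");
#endif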
#endif
constexpr index_t GridSize =
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#else
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_nkhw_desc),
BPerBlock,
KPerBlock,
CPerBlock,
N1,
N2,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_C_N1_B_N2,
InBlockCopyClusterLengths_C_N1_B_N2,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_C_K,
WeiBlockCopyClusterLengths_C_K,
WeiBlockCopyDataPerAccess_K>{};
#if 1
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
#endif
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
@@ -3,7 +3,6 @@
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
@@ -59,7 +58,6 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
@@ -93,75 +91,6 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 4, 1>;
using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 4, 4>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
constexpr index_t InBlockCopySrcDataPerRead_B = 4;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 2, 2>;
using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 8, 2>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
constexpr index_t InBlockCopySrcDataPerRead_B = 2;
constexpr index_t InBlockCopyDstDataPerWrite_N2 = 2;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif
constexpr index_t GridSize =
((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
@@ -171,47 +100,43 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
N1,
N2,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopySrcDataPerRead_B,
InBlockCopyDstDataPerWrite_N2,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,
WeiDesc,
const Tensor<TInWei>& wei_kcyx,
OutDesc,
Tensor<TOut>& out_nkhw,
index_t nrepeat)
{
    // this assumes the in / wei data type is int8x4 (NVector = 4 packed int8)
constexpr index_t NVector = 4;
using accum_t = int32_t;
using vector_t = vector_type<TInWei, NVector>;
using vector_mem_t = typename vector_t::MemoryType;
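    // Sketch of what vector_t::Pack does below (assuming TInWei = int8_t and
    // vector_type<int8_t, 4>::MemoryType being a 32-bit int8x4 word; lane
    // order is an assumption, not verified):
#if 0
    int8_t a = 1, b = 2, c = 3, d = 4;
    vector_mem_t packed = vector_t::Pack(a, b, c, d); // four int8 lanes in one word
#endif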
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
    // vectorize input: pack each group of NVector channels into one vector element
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 0
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#elif 0
in_nchw_vec(n, c, h, w) =
vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
in_nchw(n, 4 * c + 1, h, w),
in_nchw(n, 4 * c + 2, h, w),
in_nchw(n, 4 * c + 3, h, w));
#endif
};
make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
std::thread::hardware_concurrency());
// vectorize weight
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 0
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#elif 0
wei_kcyx_vec(k, c, y, x) =
vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#elif 1
wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
wei_kcyx(k, 4 * c + 1, y, x),
wei_kcyx(k, 4 * c + 2, y, x),
wei_kcyx(k, 4 * c + 3, y, x));
#endif
};
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
std::thread::hardware_concurrency());
    // allocate device memory; in / wei buffers are sized in vector elements
DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 0
// 3x3, 34x34, 128 thread, fp32, vector = 1
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, fp32, vector = 2
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 4;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 0
// 3x3, 34x34, 128 thread, int8, vector = 4
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#elif 1
// 1x1, 32x32, 128 thread, int8, vector = 4
constexpr index_t NPerBlock = 1;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 16;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 32;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 8;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 4;
constexpr index_t WoPerThread = 2;
constexpr index_t InBlockCopyDataPerRead = 2;
constexpr index_t WeiBlockCopyDataPerRead = 2;
constexpr index_t BlockSize = 128;
#endif
constexpr index_t GridSize =
(N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
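    // Unlike the implicit-gemm paths above, this grid size uses exact
    // division, so the extents are assumed to be multiples of the per-block
    // tile sizes; a sketch of the implied constraint:
#if 0
    static_assert(N % NPerBlock == 0 && K % KPerBlock == 0 &&
                      Ho % HoPerBlock == 0 && Wo % WoPerBlock == 0,
                  "direct-conv grid assumes exact tiling");
#endif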
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
TOut,
accum_t,
decltype(in_nchw_vec_desc),
decltype(wei_kcyx_vec_desc),
decltype(out_nkhw_desc),
NVector,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead,
BlockSize,
GridSize>,
dim3(GridSize),
dim3(BlockSize),
static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
            static_cast<TOut*>(out_nkhw_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
LowerPads,
UpperPads,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
    // reorder weight: KCYX -> CYXK
auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
    // reorder input: NCHW -> CHWN
auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
};
make_ParallelTensorFunctor(f_reorder_nchw2chwn, N, C, Hi, Wi)(
std::thread::hardware_concurrency());
    // output in KHWN layout
auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_chwn_device_buf.ToDevice(in_chwn.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 0
constexpr index_t NPerBlock = 1;
constexpr index_t KPerBlock = 1;
constexpr index_t CPerBlock = 1;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 1;
constexpr index_t KPerThread = 1;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t WeiBlockCopyThreadPerDim0 = 1;
constexpr index_t WeiBlockCopyThreadPerDim1 = 1;
constexpr index_t BlockSize = 8;
#elif 1
// for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 32;
constexpr index_t BlockSize = 128;
#elif 0
// 3x3 58x58, NKC = 16,256,128
constexpr index_t NPerBlock = 8;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 0
// for 5x5, 36x36
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 0
// for 7x7, 38x38
constexpr index_t NPerBlock = 8;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 0
// for 3x3, 56x56
constexpr index_t NPerBlock = 32;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 4;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 1
// 3x3 56x56, NKC = 16,256,128, with padding
// 3x3 28x28, NKC = 16,512,256, with padding
// 3x3 20x84, NKC = 16,256,256, with padding
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t WeiBlockCopyThreadPerDim0 = 2;
constexpr index_t WeiBlockCopyThreadPerDim1 = 64;
constexpr index_t BlockSize = 128;
#elif 0
// for 5x5 filter, 20x84 image, 1x1 padding
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 64;
constexpr index_t CPerBlock = 1;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 0
// 5x5 filter, 28x28 image, 2x2 padding
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 32;
constexpr index_t CPerBlock = 2;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 4;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 1;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t BlockSize = 128;
#elif 0
// for 1x1, 28x28
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 16;
constexpr index_t CPerThread = 2;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 1;
constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
constexpr index_t WeiBlockCopyThreadPerDim1 = 32;
constexpr index_t BlockSize = 128;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(
gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded<GridSize,
BlockSize,
T,
decltype(in_chwn_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
LowerPads,
UpperPads,
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
CPerThread,
HoPerThread,
WoPerThread,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>,
dim3(GridSize),
dim3(BlockSize),
static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}
@@ -7,11 +7,6 @@
#include "ConstantTensorDescriptor.hpp"
#include "device.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
using namespace ck;
@@ -417,185 +412,6 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
int main(int argc, char* argv[])
{
#if 0
constexpr index_t N = 8;
constexpr index_t C = 16;
constexpr index_t HI = 3;
constexpr index_t WI = 18;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3, 34x34
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 34;
constexpr index_t WI = 34;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3, 56x56
constexpr index_t N = 64;
constexpr index_t C = 64;
constexpr index_t HI = 56;
constexpr index_t WI = 56;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 28x28 image
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 512;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 3x3 filter, 20x84 image, 1x1 padding
constexpr index_t N = 16;
constexpr index_t C = 256;
constexpr index_t HI = 20;
constexpr index_t WI = 84;
constexpr index_t K = 256;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 1;
constexpr index_t WPad = 1;
#elif 0
// 3x3 filter, 112x112 image, 1x1 padding
constexpr index_t N = 16;
constexpr index_t C = 64;
constexpr index_t HI = 112;
constexpr index_t WI = 112;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 1;
constexpr index_t WPad = 1;
#elif 0
// 5x5 filter, 20x86 image
constexpr index_t N = 16;
constexpr index_t C = 256;
constexpr index_t HI = 20;
constexpr index_t WI = 86;
constexpr index_t K = 512;
constexpr index_t Y = 5;
constexpr index_t X = 5;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 5x5 filter, 20x86 image, 1x1 padding
constexpr index_t N = 16;
constexpr index_t C = 256;
constexpr index_t HI = 20;
constexpr index_t WI = 86;
constexpr index_t K = 512;
constexpr index_t Y = 5;
constexpr index_t X = 5;
constexpr index_t HPad = 1;
constexpr index_t WPad = 1;
#elif 0
// 5x5 filter, 28x28 image, 2x2 padding
constexpr index_t N = 16;
constexpr index_t C = 192;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 32;
constexpr index_t Y = 5;
constexpr index_t X = 5;
constexpr index_t HPad = 2;
constexpr index_t WPad = 2;
#elif 0
// 3x3 filter, 14x14 image
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 14x14 image
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 14;
constexpr index_t WI = 14;
constexpr index_t K = 512;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 7x7 image
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 2048;
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 73x73 image
constexpr index_t N = 128;
constexpr index_t C = 512;
constexpr index_t HI = 73;
constexpr index_t WI = 73;
constexpr index_t K = 128;
constexpr index_t Y = 1;
constexpr index_t X = 1;
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
// 1x1 filter, 8x8 image
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
constexpr index_t N = 64;
@@ -611,7 +427,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 1
// 1x1 filter, 8x8 image
// cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
constexpr index_t N = 128;
@@ -837,63 +653,19 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
#elif 1
in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
#elif 0
in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
}
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
if(do_verification)
{
@@ -915,12 +687,5 @@ int main(int argc, char* argv[])
upper_pads);
}
check_error(out_nkhw_host, out_nkhw_device);
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
#endif
}
}