Commit d5d3788f authored by Jianfeng Yan

minor changes

parent 43c22b57
@@ -8,6 +8,7 @@
 #include "blockwise_gemm_xdlops.hpp"
 #include "blockwise_tensor_slice_transfer_v4r1.hpp"
 #include "threadwise_tensor_slice_transfer_sfcurve.hpp"
+// #include "threadwise_tensor_slice_transfer.hpp"
 #include "gridwise_gemm_pipeline_v1.hpp"

 namespace ck {
@@ -563,29 +564,28 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                 n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex(
                     make_multi_index(n_thread_data_on_grid));

-            auto c_thread_copy =
-                ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve<FloatAcc,
-                    FloatC,
-                    decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                    decltype(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                    CElementwiseOperation,
-                    Sequence<M0, N0, I1, I1, M2, I1, M4, I1>,
-                    CThreadTransferSrcDstAccessOrder,
-                    CThreadTransferSrcDstVectorDim,
-                    CThreadTransferDstScalarPerVector,
-                    CGlobalMemoryDataOperation,
-                    1,
-                    true>{
-                    c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                    make_multi_index(m_thread_data_on_grid_idx[I0],
-                                     n_thread_data_on_grid_idx[I0],
-                                     m_thread_data_on_grid_idx[I1],
-                                     n_thread_data_on_grid_idx[I1],
-                                     m_thread_data_on_grid_idx[I2],
-                                     m_thread_data_on_grid_idx[I3],
-                                     m_thread_data_on_grid_idx[I4],
-                                     n_thread_data_on_grid_idx[I2]),
-                    c_element_op};
+            auto c_thread_copy = ThreadwiseTensorSliceTransfer_v1r3_sfcurve<
+                FloatAcc,
+                FloatC,
+                decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                decltype(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                CElementwiseOperation,
+                Sequence<M0, N0, I1, I1, M2, I1, M4, I1>,
+                CThreadTransferSrcDstAccessOrder,
+                CThreadTransferSrcDstVectorDim,
+                CThreadTransferDstScalarPerVector,
+                CGlobalMemoryDataOperation,
+                1,
+                true>{c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                      make_multi_index(m_thread_data_on_grid_idx[I0],
+                                       n_thread_data_on_grid_idx[I0],
+                                       m_thread_data_on_grid_idx[I1],
+                                       n_thread_data_on_grid_idx[I1],
+                                       m_thread_data_on_grid_idx[I2],
+                                       m_thread_data_on_grid_idx[I3],
+                                       m_thread_data_on_grid_idx[I4],
+                                       n_thread_data_on_grid_idx[I2]),
+                      c_element_op};

             c_thread_copy.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
                               make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
...
@@ -30,7 +30,7 @@ template <typename SrcData,
           index_t DstScalarStrideInVector,
           bool DstResetCoordinateAfterRun,
           typename enable_if<SrcDesc::IsKnownAtCompileTime(), bool>::type = false>
-struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
+struct ThreadwiseTensorSliceTransfer_v1r3_sfcurve
 {
     static constexpr index_t nDim = SliceLengths::Size();
@@ -38,7 +38,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
     using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve(
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3_sfcurve(
         const DstDesc& dst_desc,
         const Index& dst_slice_origin_idx,
         const DstElementwiseOperation& dst_element_op)
@@ -98,7 +98,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
         constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

         static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
             constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);

             // copy data from src_buf into dst_vector
@@ -200,4 +199,3 @@ struct ThreadwiseTensorSliceTransfer_v1r3_using_space_filling_curve
 } // namespace ck
 #endif
@@ -78,8 +78,8 @@ struct ThreadwiseTensorSliceTransfer_v6r1
             detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

         using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
                                                     DimAccessOrder,
                                                     remove_cv_t<decltype(scalar_per_access)>>;

         // loop over space-filling curve
         constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
@@ -198,4 +198,3 @@ struct ThreadwiseTensorSliceTransfer_v6r1
 } // namespace ck
 #endif
...
@@ -250,4 +250,3 @@ struct ThreadwiseTensorSliceTransfer_v6r2
 } // namespace ck
 #endif
...
@@ -300,4 +300,3 @@ struct ThreadwiseTensorSliceTransfer_v6r3
 } // namespace ck
 #endif
@@ -123,12 +123,11 @@ struct DynamicBuffer
         else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
         {
             this->template AtomicAdd<X>(i, is_valid_element, x);
         }
         else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
         {
             auto tmp = this->template Get<X>(i, is_valid_element);
-            this->template Set<X>(i, is_valid_element, x+tmp);
+            this->template Set<X>(i, is_valid_element, x + tmp);
             // tmp += x;
             // this->template Set<X>(i, is_valid_element, tmp);
         }
...
@@ -42,7 +42,8 @@ struct SpaceFillingCurve
     }

     template <index_t AccessIdx1dBegin, index_t AccessIdx1dEnd>
-    static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>, Number<AccessIdx1dEnd>)
+    static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>,
+                                                              Number<AccessIdx1dEnd>)
     {
         static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative");
         static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be larger than 0");
...
@@ -39,12 +39,12 @@ std::size_t GetFlops(ck::index_t N,
            std::accumulate(std::begin(output_spatial_lengths),
                            std::end(output_spatial_lengths),
                            static_cast<std::size_t>(1),
                            std::multiplies<std::size_t>()) *
            C *
            std::accumulate(std::begin(filter_spatial_lengths),
                            std::end(filter_spatial_lengths),
                            static_cast<std::size_t>(1),
                            std::multiplies<std::size_t>());
 }

 /**
...
@@ -48,7 +48,3 @@ target_link_libraries(gemm_xdl_alpha_beta PRIVATE host_tensor)
 target_link_libraries(conv2d_fwd_xdl_int8 PRIVATE host_tensor)
 target_link_libraries(conv3d_fwd_xdl PRIVATE host_tensor)
 target_link_libraries(convnd_fwd_xdl PRIVATE host_tensor)
-
-set(CONV2D_FWD_XDL_SFCURVE_SOURCE conv2d_fwd_xdl_sfcurve/conv2d_fwd_xdl_sfcurve.cpp)
-add_executable(conv2d_fwd_xdl_sfcurve ${CONV2D_FWD_XDL_SFCURVE_SOURCE})
-target_link_libraries(conv2d_fwd_xdl_sfcurve PRIVATE host_tensor)
# Instructions for ```conv2d_fwd_xdl``` Example
## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```conv2d_fwd_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j conv2d_fwd_xdl
```
## Run ```conv2d_fwd_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
./example/conv2d_fwd_xdl 0 1 5
```
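The full problem specification can also be passed explicitly as arg4 to arg18. For example, the following invocation (binary name as built above) spells out the default problem size hard-coded in the example source, N=128, K=256, C=192, 3x3 filter, 71x71 input, stride 2, dilation 1, padding 1, which is the case timed in the result below:
```bash
./example/conv2d_fwd_xdl 0 1 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```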
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
```
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
arg.b_grid_desc_k0_n_k1_{216, 256, 8}
arg.c_grid_desc_m_n_{ 165888, 256}
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 5 times...
Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s
```
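For reference, the reported numbers follow from the FLOP and byte counts used in the example source (flop = 2 * N * K * Ho * Wo * C * Y * X; bytes counted once per input, weight, and output element in FP16):
```
flop   = 2 * 128 * 256 * 36 * 36 * 192 * 3 * 3 ≈ 1.468e11
bytes  = 2 * (128*192*71*71 + 256*192*3*3 + 128*256*36*36) ≈ 3.336e8
TFlops = 1.468e11 / 1.43206e-3 / 1e12 ≈ 102.5
GB/s   = 3.336e8  / 1.43206e-3 / 1e9  ≈ 233
```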
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_fwd.hpp"
#include "convolution_utility.hpp"
// using InDataType = int8_t;
// using WeiDataType = int8_t;
// using OutDataType = int8_t;
// using AccDataType = int32_t;
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
using DeviceConvFwdInstance = ck::tensor_operation::device::
DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
InDataType, // InDataType
WeiDataType, // WeiDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
PassThrough, // InElementwiseOperation
PassThrough, // WeiElementwiseOperation
PassThrough, // OutElementwiseOperation
ConvFwdDefault, // ConvForwardSpecialization
256, // BlockSize
128, // MPerBlock
256, // NPerBlock
4, // K0PerBlock
8, // K1
32, // MPerXdl
32, // NPerXdl
2, // MXdlPerWave
4, // NXdlPerWave
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
8, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
8, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_K1
true, // BBlockLdsAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
using ReferenceConvFwdInstance = ck::tensor_operation::host::
ReferenceConvFwd<InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = false;
int init_method = 0;
int nrepeat = 5;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
}
else if(argc == 19)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
C = std::stoi(argv[6]);
Y = std::stoi(argv[7]);
X = std::stoi(argv[8]);
Hi = std::stoi(argv[9]);
Wi = std::stoi(argv[10]);
conv_stride_h = std::stoi(argv[11]);
conv_stride_w = std::stoi(argv[12]);
conv_dilation_h = std::stoi(argv[13]);
conv_dilation_w = std::stoi(argv[14]);
in_left_pad_h = std::stoi(argv[15]);
in_left_pad_w = std::stoi(argv[16]);
in_right_pad_h = std::stoi(argv[17]);
in_right_pad_w = std::stoi(argv[18]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
const auto output_spatial_lengths =
ck::tensor_operation::ConvolutionUtility::ComputeOutputSpatialLengths({Hi, Wi},
{Y, X},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
// tensor layout
auto f_host_tensor_descriptor = [](std::size_t N_,
std::size_t C_,
std::size_t H,
std::size_t W,
auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::KYXC>::value ||
ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-1, 1});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-1, 1});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
{
    // memset takes a byte count, so scale the element count by sizeof(OutDataType)
    memset(out_n_k_ho_wo_device_result.mData.data(),
           0,
           sizeof(OutDataType) * out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
    out_device_buf.ToDevice(out_n_k_ho_wo_device_result.mData.data());
}
// do Conv
auto conv = DeviceConvFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
float ave_time = invoker.Run(argument, nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
if(do_verification)
{
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
}
}