"...composable_kernel-1.git" did not exist on "bd7a2300061e26092edfe28c605c32101f0ec9e8"
Commit a9a392b4 authored by Chao Liu's avatar Chao Liu
Browse files

experimenting TensorCoordinate and new merged tensor copy operator

parent 2eeeb176
...@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
              "GemmDataPerReadB alignment requirement is not satisfied");
#if 0 // debug
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
...@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    InBlockCopySrcDataPerRead_B,
    InBlockCopyDstDataPerWrite_N2>(
    {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#else
auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(in_e_n1_b_n2_global_merged_desc),
decltype(in_e_n1_b_n2_block_desc),
MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
InBlockCopySubLengths_E_N1_B_N2,
InBlockCopyClusterLengths_E_N1_B_N2,
InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
#endif
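Read against the #if 0 branch above, the new copy operator makes the source and destination coordinate types explicit template parameters. A hedged annotation of the arguments as used in this hunk (roles inferred only from the names visible here, not from the class definition):

// auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
//     BlockSize,                      // cooperating threads in the block
//     Float,                          // element type
//     SrcDesc, DstDesc,               // merged global descriptor, normal LDS descriptor
//     SrcCoord, DstCoord,             // MergedTensorCoordinate / NormalTensorCoordinate
//     SliceLengths,                   // block tile: in_e_n1_b_n2_block_desc.GetLengths()
//     SubLengths, ClusterLengths,     // per-thread slice vs. thread-cluster decomposition
//     ThreadClusterArrangeOrder>(     // thread layout over the cluster
//     src_block_slice_origin,         // {0, 0, b_block_data_on_global, 0}
//     dst_block_slice_origin);        // {0, 0, 0, 0}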
// weight tensor
// tensor descriptor in device memory, src of blockwise copy
...@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    Sequence<EPerBlock, KPerBlock>{},
    Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
#if 0 // debug
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
...@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    WeiBlockCopySrcDataPerRead_E,
    WeiBlockCopyDstDataPerWrite_K>(
    {0, k_block_data_on_global}, {0, 0});
#else
auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
BlockSize,
Float,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
decltype(wei_e_k_block_desc.GetLengths()),
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
#endif
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
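For reference, the element-wise form of the update in the comment above, as a minimal standalone sketch (the real blockwise GEMM distributes this loop nest across the thread block):

// c_mtx += transpose(a_mtx) * b_mtx, with A stored as E x M, B as E x N, C as M x N.
inline void naive_gemm_at_b(const float* A, const float* B, float* C, int E, int M, int N)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
            for(int e = 0; e < E; ++e)
                C[m * N + n] += A[e * M + m] * B[e * N + n];
}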
...@@ -277,8 +305,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
__syncthreads();
#if 0
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
#else
blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
#endif
}
// copy output: register to global memory
...@@ -328,6 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
    k_thread_data_on_global, 0, b_thread_data_on_global, 0);
#if 0 // debug
threadwise_generic_tensor_slice_copy_v1(
    out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
    p_out_thread,
...@@ -338,6 +372,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
    out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
    arithmetic_sequence_gen<0, 8, 1>::type{},
    Number<1>{});
#else
ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
{0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
.Run(p_out_thread, p_out_thread_on_global);
#endif
}
}
};
...
...@@ -301,40 +301,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
    b_thread_data_on_global % B1});
threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
#elif 0
// This is a hack, because slicing a merged dimension is not supported yet.
// This should be replaced with logic above, once slicing a merged dimension support
// becomes available
// dst descriptor
constexpr auto out_k0_k1_b_global_desc =
make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
Sequence<1>{},
Sequence<2>{},
Sequence<0, 3, 4>{});
// src descriptor
constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});
auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
Float,
decltype(out_k0_k1_b_thread_desc),
decltype(out_k0_k1_b_global_desc),
NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
{0, 0, 0},
{k_thread_data_on_global / K1,
k_thread_data_on_global % K1,
b_thread_data_on_global});
for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
{
threadwise_out_copy.Run(p_out_thread, p_out_global);
threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
}
#elif 1
// This is a hack, because slicing a merged dimension is not supported yet.
// This should be replaced with logic above, once slicing a merged dimension support
...
...@@ -16,7 +16,7 @@ struct NormalTensorCoordinate
static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
__host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
    : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
{
}
...@@ -26,38 +26,15 @@ struct NormalTensorCoordinate
{
}
__host__ __device__ constexpr Array<unsigned, nDim> GetIndex() const { return mIndex; }
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
template <class IDim, bool PositiveDirection>
__host__ __device__ void
MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
{
if(PositiveDirection)
{
mIndex(idim) += step_size;
mOffset += step_size * tensor_desc_type::GetStride(idim);
}
else
{
mIndex(idim) -= step_size;
mOffset -= step_size * tensor_desc_type::GetStride(idim);
}
}
// T is Array or Sequence
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
});
return *this;
}
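The single GetOffsetFromMultiIndex(step_sizes) call above is equivalent to the removed per-dimension loop because a normal (strided) tensor's offset is linear in the multi-index; a minimal standalone check of that identity (hypothetical 3-D strides, not taken from this repository):

#include <cassert>

int main()
{
    // offset(i) = sum_d i[d] * stride[d], so offset(i + step) == offset(i) + offset(step).
    const int stride[3] = {24, 4, 1}; // hypothetical strides of a normal 3-D tensor
    const int idx[3]    = {1, 2, 3};
    const int step[3]   = {0, 1, 2};

    auto offset = [&](const int* a) {
        return a[0] * stride[0] + a[1] * stride[1] + a[2] * stride[2];
    };

    const int moved[3] = {idx[0] + step[0], idx[1] + step[1], idx[2] + step[2]};
    assert(offset(moved) == offset(idx) + offset(step));
    return 0;
}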
...@@ -65,14 +42,9 @@ struct NormalTensorCoordinate
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
});
return *this;
}
...@@ -93,19 +65,25 @@ struct NormalTensorCoordinate
return coord;
}
// reposition point of origin, and return compensated offset.
// This is a hack to reduce index calculation while looping over a tensor whose
// origin is this TensorCoordinate. It does so by spitting out the run-time offset
// to the pointer (to the tensor data) held by this TensorCoordinate, so the caller
// can fold the offset into the run-time pointer to the data. That way only one
// run-time variable (the updated pointer) is needed, instead of two (the old
// pointer plus this offset).
// TODO: after introducing the concept of "run-time tensor view", which contains the
// run-time pointer to the data, always keep track of the pointer instead of both
// the offset and the pointer. This also brings the additional benefit that we don't
// need to worry about the offset underflowing (offset is an unsigned integer) when
// updating it.
__host__ __device__ constexpr index_t RepositionOrigin()
{
    index_t offset_diff = mOffset;
mOffset = 0;
mIndex = make_zero_array<index_t, nDim>();
mOffset = 0;
    return offset_diff;
}
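A minimal sketch of the calling pattern the comment above describes, i.e. folding the compensated offset into the data pointer once so the inner loop carries a single run-time variable (function and parameter names here are hypothetical, not from this repository):

// Fold the coordinate's accumulated run-time offset into the data pointer;
// after this call the coordinate's own offset is zero.
template <class TData, class TCoord>
__host__ __device__ const TData* fold_origin_into_pointer(const TData* p_data, TCoord& coord)
{
    return p_data + coord.RepositionOrigin();
}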
// private:
private:
Array<index_t, nDim> mIndex;
index_t mOffset;
};
...@@ -120,8 +98,7 @@ struct MergedTensorCoordinate
tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
__host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
: mIndex{tensor_index},
: mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
{
    // partial offset on each dimension
    static_for<0, nDim, 1>{}([&](auto idim) {
...@@ -146,8 +123,6 @@ struct MergedTensorCoordinate
{
}
__host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
__host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
// step_size should be known at compile time
...@@ -157,17 +132,7 @@ struct MergedTensorCoordinate
{
    constexpr auto idim = IDim{};
    // update multi-index
    // update original index
if(PositiveDirection)
{
mIndex(idim) += step_size;
}
else
{
mIndex(idim) -= step_size;
}
// update rest
static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
    constexpr auto partial_original_dims =
        tensor_desc_type::GetContainedOriginalDimensions(idim);
...@@ -253,19 +218,10 @@ struct MergedTensorCoordinate
    // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
    mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
}).Else([&](auto) {
}).Else([&](auto fwd) {
constexpr auto idim_original =
static_if<PositiveDirection>{}([&](auto) {
tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
static_if<PositiveDirection>{}([&](auto fwd) {
mOriginalIndex(idim_original) += step_size;
mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
}).Else([&](auto fwd) {
}).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
mOriginalIndex(idim_original) -= step_size;
mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
});
});
}
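The "+ before -" ordering in the comment above matters because the offsets are unsigned; a tiny standalone illustration:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t offset = 5, old_partial_offset = 8, new_partial_offset = 10;

    // "offset - old_partial_offset" would wrap around for an unsigned type,
    // so add the new partial offset first, then subtract the old one.
    uint32_t updated = (offset + new_partial_offset) - old_partial_offset;

    assert(updated == 7);
    return 0;
}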
...@@ -273,10 +229,9 @@ struct MergedTensorCoordinate
template <class T>
__host__ __device__ type operator+=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
index_t normal_offset_diff = 0;
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
    this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
...@@ -288,10 +243,7 @@ struct MergedTensorCoordinate
template <class T>
__host__ __device__ type operator-=(T step_sizes)
{
#if 0
static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");
static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
static_assert(T::GetSize() == nDim, "wrong!");
static_for<0, nDim, 1>{}([&](auto idim) {
    this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
...@@ -316,33 +268,23 @@ struct MergedTensorCoordinate
return coord;
}
// reposition point of origin, and return compensated offset
__host__ __device__ constexpr index_t RepositionOrigin()
{
    index_t offset_diff = 0;

    static_for<0, nDim, 1>{}([&](auto idim_) {
        constexpr auto idim = decltype(idim_){};

        static_if<!tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto idim_original =
                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();

            mIndex(idim) = 0;
            mOriginalIndex(idim_original) = 0;
            mOffset -= mPartialOffsets[idim];
            offset_diff += mPartialOffsets[idim];
            mPartialOffsets(idim) = 0;
        });
    });

    return offset_diff;
}
// private:
Array<index_t, nDim> mIndex;

__host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

private:
// Allocate register memory for all merged dimensions and normal dimensions.
// However, only those merged dimensions whose index will be involved in arithmetic
// after the construction of this TensorCoordinate (e.g. when the user moves a slicing
// window on the merged dimension) will actually use this register memory.
// Let's hope the compiler will optimize away the register memory allocated for normal
// dimensions, and for those merged dimensions that will never be involved in index
// arithmetic after construction of the TensorCoordinate.
// TODO: refactor TensorCoordinate after introducing the concept of "dimensions",
// and simplify the implementation of ConstantMergedTensorDescriptor, so we don't need
// to count on the compiler to optimize away that register memory for us
Array<index_t, nOriginalDim> mOriginalIndex;
Array<index_t, nDim> mPartialOffsets; // mPartialOffsets is needed for unsigned index type
Array<index_t, nDim> mPartialOffsets;
// complete offset
index_t mOffset;
};
...
...@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
    constexpr auto gridwise_conv =
#if 0
#if 1
        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
#else
        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
...@@ -85,6 +85,40 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
// 1x1 filter, 8x8 image
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 128;
constexpr index_t KPerBlock = 128;
constexpr index_t EPerBlock = 8;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 4;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockCopySubLengths_E_B = Sequence<2, 2>;
using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t InBlockCopyDataPerAccess_B = 1;
using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif
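As a quick consistency check on the new "1x1 filter, 8x8 image" block above: the per-thread sub-lengths times the thread-cluster lengths appear to multiply back to the E x B and E x K block tiles, and each copy cluster covers the full block (a standalone sketch reusing the constants above):

#include <cassert>

int main()
{
    const int BlockSize = 256, BPerBlock = 128, KPerBlock = 128, EPerBlock = 8;

    // InBlockCopySubLengths_E_B<2, 2> x InBlockCopyClusterLengths_E_B<4, 64>
    assert(2 * 4 == EPerBlock && 2 * 64 == BPerBlock && 4 * 64 == BlockSize);

    // WeiBlockCopySubLengths_E_K<4, 1> x WeiBlockCopyClusterLengths_E_K<2, 128>
    assert(4 * 2 == EPerBlock && 1 * 128 == KPerBlock && 2 * 128 == BlockSize);

    return 0;
}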
...@@ -96,43 +130,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv =
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
BPerBlock,
KPerBlock,
EPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockCopySubLengths_E_B,
InBlockCopyClusterLengths_E_B,
InBlockCopyThreadClusterArrangeOrder,
InBlockCopySrcAccessOrder,
InBlockCopyDstAccessOrder,
InBlockCopyDataPerAccess_B,
WeiBlockCopySubLengths_E_K,
WeiBlockCopyClusterLengths_E_K,
WeiBlockCopyThreadClusterArrangeOrder,
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                           dim3(GridSize),
                           dim3(BlockSize),
...
...@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
{
    using namespace ck;
#if 0
#if 1
    constexpr index_t N = 64;
    constexpr index_t C = 1536;
    constexpr index_t HI = 8;
...