Commit 9535f806 authored by Chao Liu
Browse files

refactoring array

parent c98cbea0
......@@ -18,29 +18,34 @@ map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads)
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
constexpr auto i0 = Number<0>{};
constexpr auto i1 = Number<1>{};
constexpr auto i2 = Number<2>{};
constexpr auto i3 = Number<3>{};
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t N = in_n_c_hi_wi_global_desc.GetLength(i0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(i1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(i1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(i2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(i3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(i2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(i3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(i2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(i3);
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t ConvStrideH = conv_strides[i0];
const index_t ConvStrideW = conv_strides[i1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
const index_t ConvDilationH = conv_dilations[i0];
const index_t ConvDilationW = conv_dilations[i1];
const index_t InLeftPadH = in_left_pads[i0];
const index_t InLeftPadW = in_left_pads[i1];
const index_t InRightPadH = in_right_pads[i0];
const index_t InRightPadW = in_right_pads[i1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
......@@ -59,22 +64,23 @@ map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(i2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(i3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc =
transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{MultiIndex<3>{{C, Y, X}}},
DynamicMerge<3>{MultiIndex<3>{{N, Ho, Wo}}}),
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
......
......@@ -183,7 +183,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
AddressSpace::Vgpr,
AddressSpace::Lds,
InMemoryDataOperation::Set>(
MultiIndex<4>{0, 0, b_block_data_on_global, 0}, MultiIndex<4>{0, 0, 0, 0});
MultiIndex<4>{{0, 0, b_block_data_on_global, 0}}, MultiIndex<4>{{0, 0, 0, 0}});
// weight tensor
// global tensor in global memory, src of blockwise copy
......
......@@ -12,9 +12,8 @@ using MultiIndex = Array<index_t, N>;
// Build a MultiIndex (Array<index_t, N> in this configuration) from the given
// values. Delegates to make_array<index_t>, which static_casts each trailing
// argument to index_t, so mixed integral argument types are accepted.
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
    // NOTE(review): the diff rendering had left the pre-commit
    // `return MultiIndex<sizeof...(Xs)>{{...}};` line interleaved above this
    // one (unreachable duplicate); only the post-commit body is kept.
    return make_array<index_t>(xs...);
}
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
......@@ -33,5 +32,57 @@ __host__ __device__ constexpr auto make_zero_multi_index()
typename uniform_sequence_gen<NSize, 0>::type{});
}
// Convert any statically-sized, unpackable container into a MultiIndex by
// expanding its elements and forwarding them to make_multi_index.
template <typename T>
__host__ __device__ constexpr auto to_multi_index(const T& x)
{
    return unpack([](auto... elements) { return make_multi_index(elements...); }, x);
}
// Element-wise in-place addition: y[i] += x[i] for every i.
// X may be any indexable type exposing a static Size() of the same length.
template <index_t NSize, typename X>
__host__ __device__ constexpr MultiIndex<NSize>& operator+=(MultiIndex<NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise in-place subtraction: y[i] -= x[i] for every i.
// X may be any indexable type exposing a static Size() of the same length.
template <index_t NSize, typename X>
__host__ __device__ constexpr MultiIndex<NSize>& operator-=(MultiIndex<NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise sum of a MultiIndex and any same-sized indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator+(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] + rhs[i]; });

    return result;
}
// Element-wise difference of a MultiIndex and any same-sized indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator-(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] - rhs[i]; });

    return result;
}
// Element-wise (Hadamard) product of a MultiIndex and any same-sized
// indexable type.
template <index_t NSize, typename T>
__host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& lhs, const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    MultiIndex<NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] * rhs[i]; });

    return result;
}
} // namespace ck
#endif
......@@ -234,7 +234,7 @@ struct Merge
{
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, nDimLow>();
return make_zero_multi_index<nDimLow>();
}
else
{
......@@ -459,7 +459,7 @@ struct Embed
for(index_t icorner = 0; icorner < ncorner; ++icorner)
{
// generate upper index for each corner
auto idx_up = make_zero_array<index_t, nDimUp>();
auto idx_up = make_zero_multi_index<nDimUp>();
index_t itmp = icorner;
......@@ -512,7 +512,7 @@ struct Freeze
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return make_zero_array<index_t, nDimLow>();
return make_zero_multi_index<nDimLow>();
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
......
......@@ -270,7 +270,7 @@ struct TensorCoordinate
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
make_zero_multi_index<TensorDesc::GetNumOfDimension()>());
}
template <typename... Ts>
......@@ -278,7 +278,7 @@ struct TensorCoordinate
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
make_zero_multi_index<TensorDesc::GetNumOfDimension()>());
}
public:
......
......@@ -317,7 +317,7 @@ struct TransformedTensorDescriptor
// this assume each lower (single) index is only assocaited with one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
idx_low_part = tran.CalculateLowerIndex(to_multi_index(idx_up_part));
});
return idx_low;
......@@ -345,8 +345,9 @@ struct TransformedTensorDescriptor
// this assume each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
idx_low_diff_part = tran.CalculateLowerIndexDiff(to_multi_index(idx_up_diff_part),
to_multi_index(idx_up_old_part),
to_multi_index(idx_low_old_part));
});
return idx_low_diff;
......@@ -506,7 +507,8 @@ struct TransformedTensorDescriptor
constexpr auto low_dims_part = LowDimensionIds{}.At(itran);
constexpr auto low_lengths_part =
GetLowerTensorDescriptor().GetLengths(low_dims_part);
const auto idx_low_part = to_array(pick_array_element(idx_low, low_dims_part));
const auto idx_low_part =
to_multi_index(pick_array_element(idx_low, low_dims_part));
for(index_t i = 0; i < low_dims_part.Size(); ++i)
{
......
......@@ -68,9 +68,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseLoad.SetDstSliceOrigin(make_zero_multi_index<nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
mThreadwiseStore.SetSrcSliceOrigin(make_zero_multi_index<nDim>());
mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
}
}
......
......@@ -54,8 +54,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}
// Default constructor: delegates to the two-origin constructor with both the
// src and dst slice origins at the all-zero index.
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2()
    // NOTE(review): the diff rendering showed the pre-commit
    // make_zero_array-based initializer interleaved with this one; only the
    // post-commit delegating initializer is kept.
    : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_multi_index<nDim>(),
                                            make_zero_multi_index<nDim>())
{
}
......@@ -104,7 +104,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * src_data_per_access;
const index_t buffer_offset = i * src_data_per_access;
......@@ -143,7 +143,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// store data from the long-vector buffer to dst
for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
auto scalar_id = make_zero_multi_index<nDim>();
scalar_id(vector_access_dim) = i * dst_data_per_access;
const index_t buffer_offset = i * dst_data_per_access;
......@@ -177,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
// Shift the source slice window by step_sizes_, forward when
// PositiveDirection is true and backward otherwise. The direction is a
// compile-time flag so only one branch is instantiated via static_if.
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
                                   integral_constant<bool, PositiveDirection>)
{
    const auto step_sizes = to_multi_index(step_sizes_);

    // step_sizes is already a MultiIndex — wrapping it in to_multi_index
    // again (as the interleaved diff line did) is redundant; this now matches
    // MoveDstSliceWindow exactly.
    static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
        .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
......@@ -187,7 +187,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
const auto step_sizes = to_multi_index(step_sizes_);
static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
.Else([&](auto) { mDstSliceOrigin -= step_sizes; });
......
......@@ -9,49 +9,20 @@ namespace ck {
template <typename TData, index_t NSize>
struct Array
{
using type = Array<TData, NSize>;
using type = Array;
using data_type = TData;
// hack: add extra element to allow empty array
// TODO: implement empty Array
TData mData[NSize + 1] = {0};
TData mData[NSize] = {0};
__host__ __device__ static constexpr index_t Size() { return NSize; }
// TODO: remove
__host__ __device__ static constexpr index_t GetSize() { return Size(); }
template <index_t I>
__host__ __device__ constexpr const TData& At(Number<I>) const
{
static_assert(I < NSize, "wrong!");
return mData[I];
}
template <index_t I>
__host__ __device__ constexpr TData& At(Number<I>)
{
static_assert(I < NSize, "wrong!");
return mData[I];
}
__host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; }
__host__ __device__ constexpr TData& At(index_t i) { return mData[i]; }
template <typename I>
__host__ __device__ constexpr const TData& operator[](I i) const
{
return At(i);
}
__host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); }
template <typename I>
__host__ __device__ constexpr TData& operator()(I i)
{
return At(i);
}
__host__ __device__ constexpr TData& operator()(index_t i) { return At(i); }
template <typename T>
__host__ __device__ constexpr auto operator=(const T& a)
......@@ -62,127 +33,34 @@ struct Array
return *this;
}
};
template <typename T>
__host__ __device__ constexpr auto operator+=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) += a[i]; });
return *this;
}
template <typename T>
__host__ __device__ constexpr auto operator-=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) -= a[i]; });
return *this;
}
template <typename T>
__host__ __device__ constexpr auto operator+(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) + a[i]; });
return r;
}
template <typename T>
__host__ __device__ constexpr auto operator-(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) - a[i]; });
return r;
}
template <typename T>
__host__ __device__ constexpr auto operator*(const T& a) const
{
static_assert(T::Size() == Size(), "wrong! size not the same");
type r;
static_for<0, Size(), 1>{}([&](auto i) { r(i) = operator[](i) * a[i]; });
return r;
}
struct lambda_PushBack // emulate constexpr lambda
{
const Array<TData, NSize>& old_array;
Array<TData, NSize + 1>& new_array;
__host__ __device__ constexpr lambda_PushBack(const Array<TData, NSize>& old_array_,
Array<TData, NSize + 1>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t I>
__host__ __device__ constexpr void operator()(Number<I>) const
{
new_array(Number<I>{}) = old_array[I];
}
};
__host__ __device__ constexpr auto PushBack(TData x) const
{
Array<TData, NSize + 1> new_array;
static_for<0, NSize, 1>{}(lambda_PushBack(*this, new_array));
new_array(Number<NSize>{}) = x;
return new_array;
}
template <index_t NAppend>
__host__ __device__ constexpr auto Append(const Array<TData, NAppend>& xs) const
{
Array<TData, NSize + NAppend> r;
static_for<0, NSize, 1>{}([&r, this ](auto i) constexpr { r(i) = (*this)[i]; });
static_for<0, NAppend, 1>{}([&r, &xs ](auto i) constexpr { r(NSize + i) = xs[i]; });
// empty Array
template <typename TData>
struct Array<TData, 0>
{
using type = Array;
using data_type = TData;
return r;
}
__host__ __device__ static constexpr index_t Size() { return 0; }
};
// Build an Array<X, N> from one or more values. The element type X is taken
// from (or explicitly given for) the first argument; every trailing argument
// is static_cast to X so mixed argument types are accepted.
template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(const X& x, const Xs&... xs)
{
    // NOTE(review): the diff rendering had left the pre-commit return line
    // (using sizeof...(xs) on the value pack and no casts) interleaved above
    // this one as an unreachable duplicate; only the post-commit body is kept.
    return Array<X, sizeof...(Xs) + 1>{{x, static_cast<X>(xs)...}};
}
// Return a copy of 'a' with 'x' appended as the last element.
// NOTE(review): this span of the diff interleaved the deleted to_array() and
// make_zero_array() helpers with the newly added push_back(); the commit's
// other hunks replace their call sites with to_multi_index() /
// make_zero_multi_index(), so only the post-commit push_back is kept here.
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto push_back(Array<TData, NSize>& a, const TData& x)
{
    Array<TData, NSize + 1> r;

    // copy the original NSize elements
    static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; });

    // append the new element at the back
    r(Number<NSize>{}) = x;

    return r;
}
} // namespace ck
......
......@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i));
f, push_back(current_ordered_id, i));
}
}
};
......
......@@ -386,43 +386,5 @@ struct StaticallyIndexedArray<TData, 22> : Tuple<TData,
using data_type = TData;
};
// Element-wise in-place addition for StaticallyIndexedArray: y[i] += x[i].
// X may be any indexable type exposing a static Size() of the same length.
template <typename TData, index_t NSize, typename X>
__host__ __device__ constexpr StaticallyIndexedArray<TData, NSize>&
operator+=(StaticallyIndexedArray<TData, NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise in-place subtraction for StaticallyIndexedArray: y[i] -= x[i].
// X may be any indexable type exposing a static Size() of the same length.
template <typename TData, index_t NSize, typename X>
__host__ __device__ constexpr StaticallyIndexedArray<TData, NSize>&
operator-=(StaticallyIndexedArray<TData, NSize>& y, const X& x)
{
    static_assert(X::Size() == NSize, "wrong! size not the same");

    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });

    // Return a reference rather than a deduced value copy, matching the
    // conventional compound-assignment contract (the result aliases y).
    return y;
}
// Element-wise sum of a StaticallyIndexedArray and any same-sized indexable
// type.
template <typename TData, index_t NSize, typename T>
__host__ __device__ constexpr auto operator+(const StaticallyIndexedArray<TData, NSize>& lhs,
                                             const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    StaticallyIndexedArray<TData, NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] + rhs[i]; });

    return result;
}
// Element-wise difference of a StaticallyIndexedArray and any same-sized
// indexable type.
template <typename TData, index_t NSize, typename T>
__host__ __device__ constexpr auto operator-(const StaticallyIndexedArray<TData, NSize>& lhs,
                                             const T& rhs)
{
    static_assert(T::Size() == NSize, "wrong! size not the same");

    StaticallyIndexedArray<TData, NSize> result;
    static_for<0, NSize, 1>{}([&](auto i) { result(i) = lhs[i] - rhs[i]; });

    return result;
}
} // namespace ck
#endif
......@@ -29,16 +29,16 @@ void device_dummy_dynamic_transform_v2(InDesc,
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor_v2(
to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_array(ConvStrides{});
const auto conv_dilations = to_array(ConvDilations{});
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
const auto tensor_descs = map_convolution_into_gemm_v2(wei_kcyx_desc,
in_nchw_desc,
......
......@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
#if 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment