overhauling DynamicTensorDescriptor and dynamic multi-index transform in...

overhauling DynamicTensorDescriptor and dynamic multi-index transform in preparation for partially compile-time and partially run-time tensor descriptor

overhauling DynamicTensorDescriptor and dynamic multi-index transform in...
overhauling DynamicTensorDescriptor and dynamic multi-index transform in preparation for partially compile-time and partially run-time tensor descriptor
3990522d · Chao Liu · 2dea900b · 3990522d · 3990522d · 3990522d
Commit 3990522d authored Feb 17, 2021 by Chao Liu
5 changed files
--- a/composable_kernel/include/tensor_description/array_multi_index.hpp
+++ b/composable_kernel/include/tensor_description/array_multi_index.hpp
+#ifndef CK_ARRAY_MULTI_INDEX_HPP
+#define CK_ARRAY_MULTI_INDEX_HPP
+
+#include "common_header.hpp"
+
+namespace ck {
+
+template <index_t N>
+using MultiIndex = Array<index_t, N>;
+
+// Packs the given values into an index_t array via make_array. Every argument is
+// converted with list-initialization (index_t{...}), which does not permit narrowing
+// conversions.
+template <typename... Xs>
+__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
+{
+    using data_type = index_t;
+
+    return make_array<data_type>(data_type{xs}...);
+}
+
+// Builds an NSize-element multi-index whose entries all come from
+// uniform_sequence_gen<NSize, 0> (i.e. the value 0 repeated NSize times).
+template <index_t NSize>
+__host__ __device__ constexpr auto make_zero_multi_index()
+{
+    using zero_sequence = typename uniform_sequence_gen<NSize, 0>::type;
+
+    return unpack([](auto... zeros) { return make_multi_index(zeros...); }, zero_sequence{});
+}
+
+// Converts a container that unpack() can expand into a multi-index by passing its
+// elements, in order, to make_multi_index.
+template <typename T>
+__host__ __device__ constexpr auto to_multi_index(const T& x)
+{
+    const auto pack_elements = [](auto... elems) { return make_multi_index(elems...); };
+
+    return unpack(pack_elements, x);
+}
+
+// In-place element-wise addition: y[i] += x[i] for every i in [0, NSize).
+// X must expose a static Size() equal to NSize and be indexable with operator[].
+// Returns a reference to y (the conventional result of a compound assignment), so
+// using the result does not copy the whole array.
+template <index_t NSize, typename X>
+__host__ __device__ constexpr auto& operator+=(MultiIndex<NSize>& y, const X& x)
+{
+    static_assert(X::Size() == NSize, "wrong! size not the same");
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
+    return y;
+}
+
+// In-place element-wise subtraction: y[i] -= x[i] for every i in [0, NSize).
+// X must expose a static Size() equal to NSize and be indexable with operator[].
+// Returns a reference to y (the conventional result of a compound assignment), so
+// using the result does not copy the whole array.
+template <index_t NSize, typename X>
+__host__ __device__ constexpr auto& operator-=(MultiIndex<NSize>& y, const X& x)
+{
+    static_assert(X::Size() == NSize, "wrong! size not the same");
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
+    return y;
+}
+
+// Element-wise sum of a multi-index and a same-sized container: result[i] = a[i] + b[i].
+template <index_t NSize, typename T>
+__host__ __device__ constexpr auto operator+(const MultiIndex<NSize>& a, const T& b)
+{
+    static_assert(T::Size() == NSize, "wrong! size not the same");
+
+    MultiIndex<NSize> sum;
+    static_for<0, NSize, 1>{}([&](auto idim) { sum(idim) = a[idim] + b[idim]; });
+    return sum;
+}
+
+// Element-wise difference of a multi-index and a same-sized container:
+// result[i] = a[i] - b[i].
+template <index_t NSize, typename T>
+__host__ __device__ constexpr auto operator-(const MultiIndex<NSize>& a, const T& b)
+{
+    static_assert(T::Size() == NSize, "wrong! size not the same");
+
+    MultiIndex<NSize> difference;
+    static_for<0, NSize, 1>{}(
+        [&](auto idim) { difference(idim) = a[idim] - b[idim]; });
+    return difference;
+}
+
+// Element-wise product of a multi-index and a same-sized container:
+// result[i] = a[i] * b[i].
+template <index_t NSize, typename T>
+__host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T& b)
+{
+    static_assert(T::Size() == NSize, "wrong! size not the same");
+
+    MultiIndex<NSize> product;
+    static_for<0, NSize, 1>{}([&](auto idim) { product(idim) = a[idim] * b[idim]; });
+    return product;
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -326,6 +326,7 @@ struct DynamicRightPad
    }
 };

+#if 0
 // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
 template <index_t NDimUp>
 struct DynamicEmbed
@@ -413,6 +414,103 @@ struct DynamicEmbed
        printf("}");
    }
 };
+#else
+// Embed transform: maps an NDimUp-dimensional upper index onto a 1-d lower index:
+// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1]
+// UpLengths and Coefficients can each be any one of the following:
+//   1) Tuple of index_t, which is known at run-time, or
+//   2) Tuple of Number, which is known at compile-time, or
+//   3) Tuple of a mixture of index_t and Number, which is known partially at run-time and
+//      partially at compile-time
+// Both default to MultiIndex<NDimUp>. The trailing enable_if parameter rejects, at compile
+// time, any instantiation in which either container's Size() differs from NDimUp.
+template <index_t NDimUp,
+          typename UpLengths                  = MultiIndex<NDimUp>,
+          typename Coefficients               = MultiIndex<NDimUp>,
+          typename std::enable_if<UpLengths::Size() == NDimUp && Coefficients::Size() == NDimUp,
+                                  bool>::type = false>
+struct DynamicEmbed
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<NDimUp>;
+
+    // length of each upper dimension
+    UpLengths up_lengths_;
+    // multiplier applied to each upper index component
+    Coefficients coefficients_;
+
+    __host__ __device__ constexpr DynamicEmbed() = default;
+
+    // Takes the containers with their declared types (UpLengths / Coefficients) rather than
+    // the run-time UpperIndex, so compile-time (Number) entries do not have to pass through
+    // a MultiIndex first. With the default template arguments both parameter types are
+    // MultiIndex<NDimUp>, i.e. the same as UpperIndex.
+    __host__ __device__ constexpr DynamicEmbed(const UpLengths& up_lengths,
+                                               const Coefficients& coefficients)
+        : up_lengths_{up_lengths}, coefficients_{coefficients}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+
+    // Overwrites idx_low[0] with sum over i of idx_up[i] * coefficients_[i].
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+
+        idx_low(Number<0>{}) = 0;
+
+        static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) {
+            idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i];
+        });
+    }
+
+    // Incremental update: overwrites idx_diff_low[0] with
+    // sum over i of idx_diff_up[i] * coefficients_[i], then adds it to idx_low.
+    // The new upper index and the Hack tag are not used by this transform.
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& /* idx_up_new */,
+                                              Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
+                          LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+                      "wrong! inconsistent # of dimension");
+
+        idx_diff_low(Number<0>{}) = 0;
+
+        static_for<0, NDimUp, 1>{}(
+            [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; });
+
+        idx_low += idx_diff_low;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+
+    // Debug print. Only the labels are emitted; printing of the container contents is
+    // currently commented out.
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("DynamicEmbed, ");
+        printf("up_lengths_ ");
+        // print_multi_index(up_lengths_);
+        printf("coefficients_ ");
+        // print_multi_index(coefficients_);
+        printf("}");
+    }
+};
+#endif

 template <index_t NDimLow>
 struct DynamicMerge
@@ -922,6 +1020,7 @@ struct DynamicMerge
    }
 };

+#if 0
 template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
 struct DynamicUnMerge
 {
@@ -1009,6 +1108,100 @@ struct DynamicUnMerge
        printf("}");
    }
 };
+#else
+// UnMerge transform: maps an NDimUp-dimensional upper index onto a 1-d lower index:
+//   idx_low = idx_up[NDimUp-1] + sum over i in [0, NDimUp-1) of idx_up[i] * up_lengths_scan_[i]
+// up_lengths_scan_ is the reverse exclusive scan (multiplication, initial value 1) of the
+// upper lengths.
+// UpLengths / UpLengthsScan default to MultiIndex<NDimUp>; the trailing enable_if parameter
+// rejects, at compile time, any instantiation in which either container's Size() differs
+// from NDimUp.
+// When Use24BitIntegerCalculation is true, all operands are masked to their low 24 bits
+// before the multiply-add (presumably to allow 24-bit integer multiplies - TODO confirm).
+template <index_t NDimUp,
+          bool Use24BitIntegerCalculation     = false,
+          typename UpLengths                  = MultiIndex<NDimUp>,
+          typename UpLengthsScan              = MultiIndex<NDimUp>,
+          typename std::enable_if<UpLengths::Size() == NDimUp && UpLengthsScan::Size() == NDimUp,
+                                  bool>::type = false>
+struct DynamicUnMerge
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<NDimUp>;
+
+    // length of each upper dimension
+    UpLengths up_lengths_;
+    // reverse exclusive product-scan of up_lengths_
+    UpLengthsScan up_lengths_scan_;
+
+    __host__ __device__ constexpr DynamicUnMerge() = default;
+
+    // Takes the lengths with their declared type (UpLengths) rather than the run-time
+    // UpperIndex. With the default template arguments the parameter type is
+    // MultiIndex<NDimUp>, i.e. the same as UpperIndex.
+    __host__ __device__ constexpr DynamicUnMerge(const UpLengths& up_lengths)
+        : up_lengths_{up_lengths},
+          up_lengths_scan_{
+              container_reverse_exclusive_scan(up_lengths, math::multiplies<index_t>(), index_t{1})}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+
+    // Overwrites idx_low[0] with the lower index that corresponds to idx_up.
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        // the last upper dimension is added as-is (no multiplication by a scan entry)
+        idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
+
+        if constexpr(!Use24BitIntegerCalculation)
+        {
+            static_for<0, NDimUp - 1, 1>{}(
+                [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });
+        }
+        else
+        {
+            // same accumulation, with every operand masked to its low 24 bits
+            static_for<0, NDimUp - 1, 1>{}([&](auto i) {
+                idx_low(Number<0>{}) =
+                    (0x00ffffff & idx_low[Number<0>{}]) +
+                    (0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]);
+            });
+        }
+    }
+
+    // Incremental update: computes idx_diff_low from idx_diff_up with CalculateLowerIndex,
+    // then adds it to idx_low. The new upper index and the Hack tag are not used.
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& /* idx_up_new */,
+                                              Number<Hack>) const
+    {
+        CalculateLowerIndex(idx_diff_low, idx_diff_up);
+
+        idx_low += idx_diff_low;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+
+    // Debug print. Only the transform name is emitted; printing of the container contents
+    // is currently commented out.
+    __host__ __device__ void Print() const
+    {
+        printf("{");
+        printf("DynamicUnMerge, ");
+        // print_multi_index(up_lengths_);
+        // print_multi_index(up_lengths_scan_);
+        printf("}");
+    }
+};
+#endif

 struct DynamicFreeze
 {

--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
@@ -12,6 +12,25 @@ struct DynamicTensorCoordinate;
 template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
 struct DynamicTensorCoordinateIterator;

+// Returns how many distinct dimension ids appear across all lower- and upper-dimension
+// id sequences: the sequences are merged, then sorted and de-duplicated, and the size
+// of the result is returned.
+template <typename LowerDimensionIdss, typename UpperDimensionIdss>
+__host__ __device__ constexpr index_t GetNumOfHiddenDimension(LowerDimensionIdss,
+                                                              UpperDimensionIdss)
+{
+    // merges every sequence it is given into a single sequence
+    constexpr auto merge_all = [](auto&&... seqs) constexpr { return merge_sequences(seqs...); };
+
+    constexpr auto all_low_dim_ids = unpack(merge_all, LowerDimensionIdss{});
+    constexpr auto all_up_dim_ids  = unpack(merge_all, UpperDimensionIdss{});
+
+    constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids);
+
+    using unique_sort_all_dim_ids = typename sequence_unique_sort<decltype(all_dim_ids),
+                                                                  math::less<index_t>,
+                                                                  math::equal<index_t>>::type;
+
+    return unique_sort_all_dim_ids::Size();
+}
+
 // Transforms: Tuple<transforms...>
 // LowerDimensionIdss : Tuple<Sequence<...>, ...>
 // UpperDimensionIdss : Tuple<Sequence<...>, ...>
@@ -19,7 +38,9 @@ struct DynamicTensorCoordinateIterator;
 template <typename Transforms,
          typename LowerDimensionIdss,
          typename UpperDimensionIdss,
-          typename VisibleDimensionIds>
+          typename VisibleDimensionIds,
+          typename ElementSize      = index_t,
+          typename ElementSpaceSize = index_t>
 struct DynamicTensorDescriptor
 {
    // TODO make these private
@@ -63,7 +84,9 @@ struct DynamicTensorDescriptor
    __host__ __device__ constexpr DynamicTensorDescriptor(const Transforms& transforms,
                                                          index_t element_space_size)
        : transforms_{transforms},
-          hidden_lengths_{InitializeHiddenLengths(transforms_, element_space_size)}
+          element_size_{InitializeElementSize(transforms)},
+          element_space_size_{element_space_size}
+
    {
        static_assert(Transforms::Size() == ntransform_ &&
                          LowerDimensionIdss::Size() == ntransform_ &&
@@ -79,24 +102,27 @@ struct DynamicTensorDescriptor
    }

    template <index_t IDim>
-    __host__ __device__ constexpr index_t GetLength(Number<IDim>) const
+    __host__ __device__ constexpr auto GetLength(Number<IDim>) const
    {
-        return hidden_lengths_[VisibleDimensionIds::At(Number<IDim>{})];
-    }
+        static_assert(IDim >= 0 && IDim < ndim_visible_, "wrong! out of range");

-    __host__ __device__ constexpr auto GetLengths() const
-    {
-        return get_container_subset(hidden_lengths_, VisibleDimensionIds{});
-    }
+        constexpr auto tmp = FindTransformAndItsUpperDimension(Number<IDim>{});

-    __host__ __device__ constexpr index_t GetElementSize() const
-    {
-        return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
+        constexpr index_t itran   = tmp[Number<0>{}];
+        constexpr index_t idim_up = tmp[Number<1>{}];
+        constexpr bool found      = tmp[Number<2>{}];
+
+        static_assert(found == true,
+                      "wrong! not found matching transformation and upper-dimension");
+
+        return transforms_[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
    }

-    __host__ __device__ constexpr index_t GetElementSpaceSize() const
+    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
+
+    __host__ __device__ constexpr auto GetElementSpaceSize() const
    {
-        return hidden_lengths_[Number<0>{}];
+        return element_space_size_;
    }

    template <typename Idx>
@@ -125,25 +151,55 @@ struct DynamicTensorDescriptor
        return VisibleDimensionIds{};
    }

-    __host__ __device__ static constexpr auto InitializeHiddenLengths(const Transforms& transforms,
-                                                                      index_t element_space_size)
+    // Computes the element count of the visible tensor: for every visible dimension, looks
+    // up the transform / upper-dimension pair that produces it, reads that upper length,
+    // and multiplies all of the lengths together.
+    __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms)
+    {
+        const auto visible_lengths = generate_tuple(
+            [&](auto idim_visible) {
+                constexpr auto location = FindTransformAndItsUpperDimension(idim_visible);
+
+                constexpr index_t itran   = location[Number<0>{}];
+                constexpr index_t idim_up = location[Number<1>{}];
+                constexpr bool found      = location[Number<2>{}];
+
+                static_assert(found == true,
+                              "wrong! not found matching transformation and upper-dimension");
+
+                return transforms[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
+            },
+            Number<ndim_visible_>{});
+
+        // TODO: make container_reduce support tuple of Number and index_t
+        return container_reduce(visible_lengths, math::multiplies<index_t>{}, index_t{1});
+    }
+
+    template <index_t IDim>
+    __host__ __device__ static constexpr auto FindTransformAndItsUpperDimension(Number<IDim>)
    {
-        // zero initialization
-        HiddenIndex hidden_lengths = make_zero_multi_index<ndim_hidden_>();
+        constexpr auto idim_visible = Number<IDim>{};

-        // this is the orignal tensor element space size
-        hidden_lengths(Number<0>{}) = element_space_size;
+        constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible);

-        // lengths for all other hidden dimensions
-        static_for<0, ntransform_, 1>{}([&transforms, &hidden_lengths](auto itran) {
-            const auto& tran = transforms.At(itran);
+        index_t itran_found   = 0;
+        index_t idim_up_found = 0;
+        bool found            = false;

-            constexpr auto up_dim_ids = UpperDimensionIdss{}.At(itran);
+        static_for<0, ntransform_, 1>{}([&](auto itran) {
+            constexpr auto up_dim_ids = UpperDimensionIdss{}[itran];

-            set_container_subset(hidden_lengths, up_dim_ids, tran.GetUpperLengths());
+            static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) {
+                if constexpr(up_dim_ids[idim_up] == idim_hidden)
+                {
+                    itran_found   = itran;
+                    idim_up_found = idim_up;
+                    found         = true;
+                }
+            });
        });

-        return hidden_lengths;
+        return make_tuple(itran_found, idim_up_found, found);
    }

    __host__ __device__ void Print() const
@@ -165,9 +221,8 @@ struct DynamicTensorDescriptor

    // TODO make these private
    Transforms transforms_;
-    // TODO maybe hidden_lengths_ should use reference_wrapper (reference to transforms_'s member
-    //  variable lengths_) to save space on stack?
-    HiddenIndex hidden_lengths_;
+    ElementSize element_size_;
+    ElementSpaceSize element_space_size_;
 };

 template <index_t NDimHidden, typename VisibleDimensionIds>

--- a/composable_kernel/include/tensor_description/multi_index.hpp
+++ b/composable_kernel/include/tensor_description/multi_index.hpp
@@ -3,176 +3,10 @@

 #include "common_header.hpp"

-namespace ck {
-
 #if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
-template <index_t N>
-using MultiIndex = Array<index_t, N>;
-
-template <typename... Xs>
-__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
-{
-    return make_array<index_t>(index_t{xs}...);
-}
-
-template <index_t NSize>
-__host__ __device__ constexpr auto make_zero_multi_index()
-{
-    return unpack([](auto... xs) { return make_multi_index(xs...); },
-                  typename uniform_sequence_gen<NSize, 0>::type{});
-}
-
-template <typename T>
-__host__ __device__ constexpr auto to_multi_index(const T& x)
-{
-    return unpack([](auto... ys) { return make_multi_index(ys...); }, x);
-}
-
-template <index_t NSize, typename X>
-__host__ __device__ constexpr auto operator+=(MultiIndex<NSize>& y, const X& x)
-{
-    static_assert(X::Size() == NSize, "wrong! size not the same");
-    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
-    return y;
-}
-
-template <index_t NSize, typename X>
-__host__ __device__ constexpr auto operator-=(MultiIndex<NSize>& y, const X& x)
-{
-    static_assert(X::Size() == NSize, "wrong! size not the same");
-    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
-    return y;
-}
-
-template <index_t NSize, typename T>
-__host__ __device__ constexpr auto operator+(const MultiIndex<NSize>& a, const T& b)
-{
-    using type = MultiIndex<NSize>;
-    static_assert(T::Size() == NSize, "wrong! size not the same");
-    type r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] + b[i]; });
-    return r;
-}
-
-template <index_t NSize, typename T>
-__host__ __device__ constexpr auto operator-(const MultiIndex<NSize>& a, const T& b)
-{
-    using type = MultiIndex<NSize>;
-    static_assert(T::Size() == NSize, "wrong! size not the same");
-    type r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] - b[i]; });
-    return r;
-}
-
-template <index_t NSize, typename T>
-__host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T& b)
-{
-    using type = MultiIndex<NSize>;
-    static_assert(T::Size() == NSize, "wrong! size not the same");
-    type r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] * b[i]; });
-    return r;
-}
-
+#include "array_multi_index.hpp"
 #else
-template <index_t N>
-using MultiIndex = StaticallyIndexedArray<index_t, N>;
-
-template <typename... Xs>
-__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
-{
-    return make_statically_indexed_array<index_t>(index_t{xs}...);
-}
-
-template <index_t NSize>
-__host__ __device__ constexpr auto make_zero_multi_index()
-{
-    return unpack([](auto... xs) { return make_multi_index(xs...); },
-                  typename uniform_sequence_gen<NSize, 0>::type{});
-}
-
-template <typename T>
-__host__ __device__ constexpr auto to_multi_index(const T& x)
-{
-    return unpack([](auto... ys) { return make_multi_index(ys...); }, x);
-}
-
-// Here should use MultiIndex<NSize>, instead of Tuple<Ys...>, although the former
-// is the alias of the latter. This is because compiler cannot infer the NSize if
-// using MultiIndex<NSize>
-// TODO: how to fix this?
-template <typename... Ys, typename X>
-__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
-{
-    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
-    constexpr index_t NSize = sizeof...(Ys);
-    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
-    return y;
-}
-
-template <typename... Ys, typename X>
-__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
-{
-    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
-    constexpr index_t NSize = sizeof...(Ys);
-    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
-    return y;
-}
-
-template <typename... Xs, typename Y>
-__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
-{
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
-    constexpr index_t NSize = sizeof...(Xs);
-
-    Tuple<Xs...> r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; });
-    return r;
-}
-
-template <typename... Xs, typename Y>
-__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
-{
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
-    constexpr index_t NSize = sizeof...(Xs);
-
-    Tuple<Xs...> r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; });
-    return r;
-}
-
-template <typename... Xs, typename Y>
-__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
-{
-    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
-    constexpr index_t NSize = sizeof...(Xs);
-
-    Tuple<Xs...> r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; });
-    return r;
-}
-
-// MultiIndex = index_t * MultiIndex
-template <typename... Xs>
-__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
-{
-    constexpr index_t NSize = sizeof...(Xs);
-
-    Tuple<Xs...> r;
-    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });
-    return r;
-}
-
-template <typename... Xs>
-__host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
-{
-    printf("{");
-    printf("MultiIndex, ");
-    printf("size %d,", index_t{sizeof...(Xs)});
-    static_for<0, sizeof...(Xs), 1>{}([&](auto i) { printf("%d ", x.At(i)); });
-    printf("}");
-}
-
+#include "statically_indexed_array_multi_index.hpp"
 #endif
-} // namespace ck
+
 #endif
--- a/composable_kernel/include/tensor_description/statically_indexed_array_multi_index.hpp
+++ b/composable_kernel/include/tensor_description/statically_indexed_array_multi_index.hpp
+#ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
+#define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
+
+#include "common_header.hpp"
+
+namespace ck {
+
+template <index_t N>
+using MultiIndex = StaticallyIndexedArray<index_t, N>;
+
+// Packs the given values into a statically indexed index_t array. Every argument is
+// converted with list-initialization (index_t{...}), which does not permit narrowing
+// conversions.
+template <typename... Xs>
+__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
+{
+    using data_type = index_t;
+
+    return make_statically_indexed_array<data_type>(data_type{xs}...);
+}
+
+// Builds an NSize-element multi-index whose entries all come from
+// uniform_sequence_gen<NSize, 0> (i.e. the value 0 repeated NSize times).
+template <index_t NSize>
+__host__ __device__ constexpr auto make_zero_multi_index()
+{
+    using zero_sequence = typename uniform_sequence_gen<NSize, 0>::type;
+
+    return unpack([](auto... zeros) { return make_multi_index(zeros...); }, zero_sequence{});
+}
+
+// Converts a container that unpack() can expand into a multi-index by passing its
+// elements, in order, to make_multi_index.
+template <typename T>
+__host__ __device__ constexpr auto to_multi_index(const T& x)
+{
+    const auto pack_elements = [](auto... elems) { return make_multi_index(elems...); };
+
+    return unpack(pack_elements, x);
+}
+
+// Ideally this overload would take MultiIndex<NSize>& rather than Tuple<Ys...>& (the former
+// is an alias of the latter), but the compiler cannot deduce NSize through the alias, so
+// the parameter is spelled as Tuple<Ys...>.
+// TODO: how to fix this?
+//
+// In-place element-wise addition: y[i] += x[i] for every element of y.
+// X must expose a static Size() equal to the tuple size and be indexable with operator[].
+// Returns a reference to y (the conventional result of a compound assignment), so using
+// the result does not copy the whole tuple.
+template <typename... Ys, typename X>
+__host__ __device__ constexpr auto& operator+=(Tuple<Ys...>& y, const X& x)
+{
+    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Ys);
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
+    return y;
+}
+
+// In-place element-wise subtraction: y[i] -= x[i] for every element of y.
+// X must expose a static Size() equal to the tuple size and be indexable with operator[].
+// Returns a reference to y (the conventional result of a compound assignment), so using
+// the result does not copy the whole tuple.
+template <typename... Ys, typename X>
+__host__ __device__ constexpr auto& operator-=(Tuple<Ys...>& y, const X& x)
+{
+    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Ys);
+    static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
+    return y;
+}
+
+// Element-wise sum of a tuple and a same-sized container: result[i] = x[i] + y[i].
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t ndim = sizeof...(Xs);
+
+    Tuple<Xs...> sum;
+    static_for<0, ndim, 1>{}([&](auto idim) { sum(idim) = x[idim] + y[idim]; });
+    return sum;
+}
+
+// Element-wise difference of a tuple and a same-sized container: result[i] = x[i] - y[i].
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t ndim = sizeof...(Xs);
+
+    Tuple<Xs...> difference;
+    static_for<0, ndim, 1>{}(
+        [&](auto idim) { difference(idim) = x[idim] - y[idim]; });
+    return difference;
+}
+
+// Element-wise product of a tuple and a same-sized container: result[i] = x[i] * y[i].
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t ndim = sizeof...(Xs);
+
+    Tuple<Xs...> product;
+    static_for<0, ndim, 1>{}([&](auto idim) { product(idim) = x[idim] * y[idim]; });
+    return product;
+}
+
+// MultiIndex = index_t * MultiIndex
+// Scales every element of x by the scalar a: result[i] = a * x[i].
+template <typename... Xs>
+__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
+{
+    constexpr index_t ndim = sizeof...(Xs);
+
+    Tuple<Xs...> scaled;
+    static_for<0, ndim, 1>{}([&](auto idim) { scaled(idim) = a * x[idim]; });
+    return scaled;
+}
+
+// Debug helper: prints the tuple as "{MultiIndex, size N,e0 e1 ... }" using printf,
+// formatting the size and every element with %d.
+template <typename... Xs>
+__host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
+{
+    constexpr index_t ndim = index_t{sizeof...(Xs)};
+
+    printf("{");
+    printf("MultiIndex, ");
+    printf("size %d,", ndim);
+    static_for<0, ndim, 1>{}([&](auto idim) { printf("%d ", x.At(idim)); });
+    printf("}");
+}
+
+} // namespace ck
+#endif