Commit 34cbbb48 authored by Chao Liu

refactored dynamically indexed array; works now

parent 4d70c71b
......@@ -97,10 +97,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -542,10 +542,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
......@@ -564,7 +564,8 @@ struct DummyDynamicTransform_v1
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, idx);
for(index_t iter = 0; iter < niter; ++iter)
{
......@@ -587,7 +588,7 @@ struct DummyDynamicTransform_v1
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 0
#if 1
in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
#else
true,
......@@ -603,10 +604,10 @@ struct DummyDynamicTransform_v1
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads) const
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
......
......@@ -117,19 +117,41 @@ struct DummyDynamicTransform_v2_1
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v2(in_gemmk_gemmn_global_desc, idx);
const auto in_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
#pragma unroll 1
for(index_t i = 0; i < 10; ++i)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate_v2(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
p_out_global[in_gemmk_gemmn_coord.GetOffset()] = 1;
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
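For reference, a minimal standalone sketch of the access pattern in the loop above, written against plain pointers; guarded_store, offset, and is_valid are hypothetical stand-ins for transfer_data, GetOffset(), and coordinate_has_valid_offset_assuming_visible_index_is_valid(). This is an illustration only, not part of the commit.

#include <cstddef>

void guarded_store(float* out, std::size_t offset, bool is_valid, float value)
{
    // the validity flag gates the global store, so coordinates that map into
    // padding (or otherwise outside real data) write nothing
    if(is_valid)
    {
        out[offset] = value;
    }
}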
......@@ -215,13 +237,13 @@ struct DummyDynamicTransform_v2_1
// initialize idx
static_for<0, 4, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 0
#if 1
const index_t niter = p_wei_global[10];
auto in_coord = make_dynamic_tensor_coordinate_v2(in_n_c_hip_wip_global_desc, idx);
const auto in_coord_step = make_dynamic_tensor_coordinate_step_v2(
in_n_c_hip_wip_global_desc, MultiIndex<4>{{1, 0, 0, 0}});
in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
......@@ -266,7 +288,7 @@ struct DummyDynamicTransform_v2_1
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_1(p_wei_global,
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
......@@ -293,7 +315,6 @@ struct DummyDynamicTransform_v2_2
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 1
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
......@@ -322,13 +343,14 @@ struct DummyDynamicTransform_v2_2
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 0
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
#else
p_out_global[in_gemmk_gemmn_global_desc.CalculateOffset(idx)] = 1;
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
};
......
......@@ -371,10 +371,10 @@ struct DynamicMerge
__host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{reverse_exclusive_scan_on_array<index_t, NDimLow>(
low_lengths_scan_{container_reverse_exclusive_scan<index_t, NDimLow>(
low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_lengths_{make_multi_index(
reduce_on_array(low_lengths, math::multiplies<index_t>(), index_t{1}))}
container_reduce(low_lengths, math::multiplies<index_t>(), index_t{1}))}
{
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}
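For reference, a minimal standalone sketch of what the renamed helpers compute in this constructor, using std::array in place of ck::Array (illustration only, not part of the commit): the reverse exclusive scan yields the per-dimension strides of the merged index, and the reduce yields the merged length.

#include <array>
#include <cstddef>

// reverse exclusive scan with multiplication, init = 1
template <std::size_t N>
constexpr std::array<int, N> reverse_exclusive_scan_mul(std::array<int, N> x)
{
    std::array<int, N> y{};
    int r = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        y[i] = r; // exclusive: x[i] not folded in yet
        r *= x[i];
    }
    return y;
}

// reverse_exclusive_scan_mul({2, 3, 4}) == {12, 4, 1}, i.e. low_lengths_scan_,
// and the full product 24 is what container_reduce(..., multiplies, 1) returns
// for up_lengths_.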
......@@ -516,7 +516,7 @@ struct DynamicUnMerge
__host__ __device__ constexpr DynamicUnMerge(const UpperIndex& up_lengths)
: up_lengths_{up_lengths},
up_lengths_scan_{
reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>(), index_t{1})}
container_reverse_exclusive_scan(up_lengths, math::multiplies<index_t>(), index_t{1})}
{
}
......
......@@ -263,7 +263,7 @@ struct DynamicTensorCoordinate
{
static constexpr index_t NDim = TensorDesc::GetNumOfDimension();
using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
using type = decltype(make_dynamic_tensor_coordinate<NDim>(TensorDesc{}, MultiIndex<NDim>{}));
};
} // namespace ck
......
......@@ -45,7 +45,7 @@ struct DynamicNativeTensorDescriptor
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
......@@ -136,7 +136,7 @@ struct DynamicTransformedTensorDescriptor
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return array_cat(xs...);
return container_cat(xs...);
}
};
......@@ -211,21 +211,22 @@ struct DynamicTransformedTensorDescriptor
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
container_reorder_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr index_t GetLength(index_t idim) const
template <index_t IDim>
__host__ __device__ constexpr index_t GetLength(Number<IDim>) const
{
return GetLengths()[idim];
return GetLengths()[Number<IDim>{}];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
......
......@@ -18,7 +18,7 @@ make_dynamic_native_tensor_descriptor_packed_v2(const MultiIndex<N>& lengths)
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
const index_t element_space_size =
reduce_on_array(lengths, math::multiplies<index_t>{}, index_t{1});
container_reduce(lengths, math::multiplies<index_t>{}, index_t{1});
return DynamicTensorDescriptor_v2<decltype(transforms),
decltype(low_dim_hidden_idss),
......
......@@ -116,7 +116,7 @@ struct DynamicTensorDescriptor_v2
// maybe this result should be saved as a member variable
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
return container_reduce(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpaceSize() const
......@@ -411,13 +411,13 @@ transform_dynamic_tensor_descriptor_v2(const OldTensorDescriptor& old_tensor_des
unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered);
// put everything together
const auto all_transforms = tuple_cat(old_tensor_desc.GetTransforms(), new_transforms);
const auto all_transforms = container_cat(old_tensor_desc.GetTransforms(), new_transforms);
constexpr auto all_low_dim_hidden_idss =
tuple_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
container_cat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss);
constexpr auto all_up_dim_hidden_idss =
tuple_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
container_cat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss);
return DynamicTensorDescriptor_v2<decltype(all_transforms),
decltype(all_low_dim_hidden_idss),
......@@ -494,7 +494,7 @@ make_dynamic_tensor_coordinate_step_v2(const TensorDesc&, const VisibleIndex& id
// 2) all components of lower index diff will assume to be non-zero and need to be
// computed
const bool idx_diff_up_has_non_zero =
reduce_on_array(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
container_reduce(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
do_transforms(itran) = idx_diff_up_has_non_zero;
......
......@@ -9,30 +9,11 @@ namespace ck {
template <index_t N>
using MultiIndex = Array<index_t, N>;
#if 1 // works
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_array<const index_t>(std::forward<const Xs>(xs)...);
}
#else // doesn't work: don't know how to get the lvalue/rvalue reference correct
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs&&... xs)
{
return make_array<const index_t>(std::forward<const Xs>(xs)...);
}
#endif
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
}
#endif
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
......@@ -93,5 +74,84 @@ __host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T
return r;
}
#else
template <index_t N>
using MultiIndex = StaticallyIndexedArray<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(const Xs&... xs)
{
return make_statically_indexed_array<const index_t>(std::forward<const Xs>(xs)...);
}
template <index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index()
{
return unpack([](auto... xs) { return make_multi_index(xs...); },
typename uniform_sequence_gen<NSize, 0>::type{});
}
template <typename T>
__host__ __device__ constexpr auto to_multi_index(const T& x)
{
return unpack([](auto... ys) { return make_multi_index(ys...); }, x);
}
// Ideally MultiIndex<NSize> should be used here instead of Tuple<Ys...>, even though
// the former is an alias of the latter, but the compiler cannot deduce NSize when
// the parameter is written as MultiIndex<NSize>.
// TODO: how to fix this?
template <typename... Ys, typename X>
__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
return y;
}
template <typename... Ys, typename X>
__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
return y;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; });
return r;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; });
return r;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; });
return r;
}
#endif
} // namespace ck
#endif
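As a quick reference for the operators above, a standalone sketch of the elementwise behavior they give MultiIndex, written with std::array instead of Tuple/static_for (illustration only, not part of the commit):

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<int, N> add(std::array<int, N> a, std::array<int, N> b)
{
    std::array<int, N> r{};
    for(std::size_t i = 0; i < N; ++i)
    {
        r[i] = a[i] + b[i]; // same element-by-element rule as operator+ above
    }
    return r;
}

// add({1, 2}, {3, 4}) == {4, 6}; operator+=, operator-, operator-= and
// operator* follow the same pattern with the corresponding scalar operation.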
......@@ -3,17 +3,17 @@
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "array_element_picker.hpp"
namespace ck {
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const TData& x)
__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
{
Array<TData, NSize + 1> r;
......@@ -25,137 +25,64 @@ __host__ __device__ constexpr auto push_back(const Array<TData, NSize>& a, const
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
__host__ __device__ constexpr auto
container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
return make_array(old_array[Number<IRs>{}]...);
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
__host__ __device__ constexpr auto
container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
return container_reorder_given_new2old(
old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> /*new2old*/)
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return result;
return make_tuple(old_tuple[Number<IRs>{}]...);
}
// Array = Array * TData
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> old2new)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
return container_reorder_given_new2old(
old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
}
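A small standalone sketch of the reorder semantics, using std::array (illustration only, not part of the commit): new2old[i] names which old element lands at new position i, and the old2new overloads simply invert that map before forwarding, as done above.

#include <array>
#include <cstddef>

template <std::size_t N>
constexpr std::array<int, N> reorder_new2old(std::array<int, N> old_arr,
                                             std::array<std::size_t, N> new2old)
{
    std::array<int, N> new_arr{};
    for(std::size_t i = 0; i < N; ++i)
        new_arr[i] = old_arr[new2old[i]];
    return new_arr;
}

// reorder_new2old({10, 20, 30}, {2, 0, 1}) == {30, 10, 20}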
template <typename TData, typename Arr, typename Reduce>
__host__ __device__ constexpr TData reduce_on_array(const Arr& a, Reduce f, TData init)
template <typename TData, typename Container, typename Reduce>
__host__ __device__ constexpr TData container_reduce(const Container& a, Reduce f, TData init)
{
// static_assert(is_same<typename Arr::data_type, TData>::value, "wrong! different data type");
static_assert(Arr::Size() > 0, "wrong");
static_assert(Container::Size() > 0, "wrong");
TData result = init;
static_for<0, Arr::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });
static_for<0, Container::Size(), 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#if 0
#pragma unroll
for(index_t i = NSize - 1; i >= 0; --i)
{
r = f(r, x[i]);
y(i) = r;
}
#else
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
......@@ -163,36 +90,61 @@ reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
#endif
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#if 0
#pragma unroll
for(index_t i = NSize - 1; i > 0; --i)
{
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
}
});
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_exclusive_scan(
const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
y(0) = r;
#else
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
#endif
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_inclusive_scan(
const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
......@@ -204,19 +156,21 @@ __host__ __device__ constexpr auto container_cat(const X& x, const Ys&... ys)
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& x, const Array<T, NY>& y)
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
Array<T, NX + NY> z;
static_for<0, NX, 1>{}([&](auto i) { z(i) = x[i]; });
static_for<0, NY, 1>{}([&](auto i) { z(i + Number<NX>{}) = y[i]; });
return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
}
return z;
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_cat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
}
template <typename T, index_t N>
__host__ __device__ constexpr auto container_cat(const Array<T, N>& x)
template <typename Container>
__host__ __device__ constexpr auto container_cat(const Container& x)
{
return x;
}
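For reference, a standalone sketch of the concatenation the unpack2-based container_cat performs, using std::array (illustration only, not part of the commit):

#include <array>
#include <cstddef>

template <typename T, std::size_t NX, std::size_t NY>
constexpr std::array<T, NX + NY> cat(std::array<T, NX> x, std::array<T, NY> y)
{
    std::array<T, NX + NY> z{};
    for(std::size_t i = 0; i < NX; ++i)
        z[i] = x[i];
    for(std::size_t i = 0; i < NY; ++i)
        z[NX + i] = y[i];
    return z;
}

// cat(std::array<int, 2>{1, 2}, std::array<int, 3>{3, 4, 5}) == {1, 2, 3, 4, 5}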
......
......@@ -9,7 +9,7 @@
#define CK_UNSIGNED_INDEX_TYPE 0
// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 1
// device backend
#define CK_DEVICE_BACKEND_AMD 1
......
......@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i)
{
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, push_back(current_ordered_id, i));
f, container_push_back(current_ordered_id, i));
}
}
};
......@@ -77,7 +77,7 @@ struct ford_impl<Sequence<>, Orders>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{
// retrieve unordered Id
f(reorder_array_given_old2new(current_ordered_id, Orders{}));
f(container_reorder_given_old2new(current_ordered_id, Orders{}));
}
};
......
......@@ -49,7 +49,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
// TODO: properly implement unpack that takes any number of containers
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto unpack(F&& f, X&& x, Y&& y)
__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
{
using X_ = remove_reference_t<X>;
using Y_ = remove_reference_t<Y>;
......
......@@ -36,42 +36,5 @@ __host__ __device__ constexpr auto make_statically_indexed_array()
return StaticallyIndexedArray<X, 0>();
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const StaticallyIndexedArray<TData, NSize>& x, Reduce f, TData init)
{
StaticallyIndexedArray<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
} // namespace ck
#endif
......@@ -128,12 +128,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
__host__ __device__ constexpr Tuple(Tuple&&) = default;
#if 0
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
#else
template <typename... Ys>
#endif
__host__ __device__ explicit constexpr Tuple(const Tuple<Ys...>& y)
: base(static_cast<
const detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
......@@ -141,12 +137,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
{
}
#if 0
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs), bool>::type = false>
#else
template <typename... Ys>
#endif
__host__ __device__ explicit constexpr Tuple(Tuple<Ys...>&& y)
: base(static_cast<
detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(Ys), 1>::type,
......@@ -154,7 +146,9 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
{
}
template <typename... Ys, typename std::enable_if<sizeof...(Ys) >= 1, bool>::type = false>
template <typename... Ys,
typename std::enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 1,
bool>::type = false>
__host__ __device__ explicit constexpr Tuple(Ys&&... ys) : base(std::forward<Ys>(ys)...)
{
}
......
......@@ -12,13 +12,6 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
template <typename... Tuples>
__host__ __device__ constexpr auto tuple_cat(Tuples&&... tuples)
{
return unpack([&](auto&&... xs) { return make_tuple(std::forward<decltype(xs)>(xs)...); },
std::forward<Tuples>(tuples)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
......
......@@ -52,7 +52,7 @@ void device_dummy_dynamic_transform_v1(InDesc,
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
make_dynamic_tensor_coordinate<2>(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
for(index_t iter = 0; iter < 10; ++iter)
{
......@@ -115,10 +115,10 @@ void device_dummy_dynamic_transform_v1(InDesc,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
......
......@@ -96,7 +96,7 @@ void device_dummy_dynamic_transform_v2(InDesc,
for(index_t j = 0; j < nrepeat; ++j)
{
#if 0
#if 1
launch_kernel(run_gridwise_operation<DummyDynamicTransform_v2_1<BlockSize>,
index_t* const,
float* const,
......
......@@ -549,7 +549,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
#if 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......