Commit 0b7fcca6 authored by Chao Liu

prototype dynamic tensor descriptor

parent 4388f572
@@ -3,7 +3,7 @@ project(modular_convolution)
#c++
enable_language(CXX)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
...
@@ -414,7 +414,7 @@ struct DummyDynamicTransform
idx[13] += idx_diff[13];

// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
@@ -462,25 +462,102 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
-Index idx_up;
-
-idx_up(0) = in_n_c_hi_wi_global_desc.GetLength(0);
-idx_up(1) = in_n_c_hi_wi_global_desc.GetLength(1);
-idx_up(2) = in_n_c_hi_wi_global_desc.GetLength(2);
-idx_up(3) = in_n_c_hi_wi_global_desc.GetLength(3);
-
-#if 0
-constexpr auto trans = GetTransforms();
-auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
-#elif 1
-constexpr DynamicCoordinateTransform* tran = &embed;
-auto idx_low = tran->CalculateLowerIndex(idx_up);
-#endif
-
-p_out_global[get_thread_local_1d_id()] = idx_low[0];
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);

const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];

const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];

const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];

const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];

// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
#pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter)
{
//
MultiIndex<2> idx;
// initialize idx
for(index_t i = 0; i < 2; ++i)
{
idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
}
// offset
index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
// is_in_bound
bool is_in_bound =
in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
offset,
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}

__device__ void Run(index_t* const __restrict__ p_wei_global,
...
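For reference, the pad -> embed -> merge chain constructed above is dynamic im2col indexing. A minimal standalone sketch of the same arithmetic (plain C++; the function and its explicit div/mod are illustrative, not part of this commit):

#include <cstdint>

using index_t = std::int32_t;

// Map a GEMM coordinate (gemmk = (c * Y + y) * X + x, gemmn = (n * Ho + ho) * Wo + wo)
// back to an input NCHW offset, replaying Merge -> Embed -> LeftPad/RightPad by hand.
index_t im2col_offset(index_t gemmk, index_t gemmn,
                      index_t C, index_t Y, index_t X,
                      index_t Ho, index_t Wo,
                      index_t Hi, index_t Wi,
                      index_t stride_h, index_t stride_w,
                      index_t dilation_h, index_t dilation_w,
                      index_t left_pad_h, index_t left_pad_w,
                      bool& is_in_bound)
{
    // DynamicMerge<3>{{C, Y, X}}: split gemmk by repeated division
    const index_t c = gemmk / (Y * X);
    const index_t y = (gemmk / X) % Y;
    const index_t x = gemmk % X;

    // DynamicMerge<3>{{N, Ho, Wo}}: split gemmn the same way
    const index_t n  = gemmn / (Ho * Wo);
    const index_t ho = (gemmn / Wo) % Ho;
    const index_t wo = gemmn % Wo;

    // DynamicEmbed<2>{{Y, Ho}, {dilation, stride, 0}}: coordinate in the padded input
    const index_t hip = y * dilation_h + ho * stride_h;
    const index_t wip = x * dilation_w + wo * stride_w;

    // DynamicLeftPad/DynamicRightPad: shift back into the unpadded tensor and
    // record whether the coordinate falls inside the real data
    const index_t hi = hip - left_pad_h;
    const index_t wi = wip - left_pad_w;
    is_in_bound = hi >= 0 && hi < Hi && wi >= 0 && wi < Wi;

    // offset in a packed NCHW native descriptor
    return ((n * C + c) * Hi + hi) * Wi + wi;
}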
@@ -10,30 +10,41 @@ struct DynamicPassThrough
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

-const index_t low_length_;
+const index_t up_length_;

__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
-: low_length_(low_length)
+: up_length_{low_length}
{
}

+__host__ __device__ explicit constexpr DynamicPassThrough() : up_length_{0} {}

__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }

__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }

-__host__ __device__ constexpr auto GetUpperIndex() { return UpperIndex({low_length_}); }
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
{
-return idx_up;
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0];
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -43,8 +54,9 @@ struct DynamicPassThrough
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
@@ -56,34 +68,43 @@ struct DynamicLeftPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

-const index_t low_length_;
+const index_t up_length_;
const index_t left_pad_;

-__host__ __device__ explicit constexpr Pad(const index_t& low_length, const index_t& left_pad)
-: low_length_{low_length}, left_pad_{left_pad}
+__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length,
+const index_t& left_pad)
+: up_length_{low_length + left_pad}, left_pad_{left_pad}
{
}

+__host__ __device__ explicit constexpr DynamicLeftPad() : up_length_{0}, left_pad_{0} {}

-__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
-
-__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex()
-{
-return UpperIndex({low_length_ + left_pad_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-return LowerIndex{idx_up - lef_pad_};
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0] - left_pad_;
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -93,8 +114,9 @@ struct DynamicLeftPad
return SkipIsValidCheck;
}

-__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
+template <typename UpIdx>
+__host__ __device__ constexpr bool
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] >= left_pad_);
}
@@ -106,34 +128,47 @@ struct DynamicRightPad
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;

+const index_t up_length_;
const index_t low_length_;
const index_t right_pad_;

-__host__ __device__ explicit constexpr Pad(const index_t& low_length, const index_t& right_pad)
-: low_length_{low_length}, right_pad_{right_pad}
+__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length,
+const index_t& right_pad)
+: up_length_{low_length + right_pad}, low_length_{low_length}, right_pad_{right_pad}
{
}

+__host__ __device__ explicit constexpr DynamicRightPad()
+: up_length_{0}, low_length_{0}, right_pad_{0}
+{
+}

-__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
-
-__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex()
-{
-return UpperIndex({low_length_ + right_pad_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up)
{
-return idx_up;
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = idx_up[0];
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return idx_up_diff;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = idx_up_diff[0];
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -143,8 +178,9 @@ struct DynamicRightPad
return SkipIsValidCheck;
}

-__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
+template <typename UpIdx>
+__host__ __device__ constexpr bool
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] < low_length_);
}
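Note how the two pads compose on one axis: DynamicLeftPad{Hi, PadL} reports upper length Hi + PadL and checks idx_up >= PadL, while the DynamicRightPad{Hi + PadL, PadR} stacked on top keeps low_length_ = Hi + PadL so it can check idx_up < Hi + PadL. A standalone sketch (plain C++; names illustrative):

using index_t = int;

// One axis of length Hi, padded by PadL on the left and PadR on the right.
// The upper length is Hi + PadL + PadR; an upper index hits real data only
// when both pad transforms report a valid mapping.
bool padded_axis_in_bound(index_t h_up, index_t Hi, index_t PadL, index_t PadR)
{
    const bool left_ok  = h_up >= PadL;     // DynamicLeftPad validity check
    const bool right_ok = h_up < Hi + PadL; // DynamicRightPad validity check
    (void)PadR; // PadR only extends the upper length; it never maps in-bounds
    return left_ok && right_ok;
}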
@@ -157,50 +193,61 @@ struct DynamicEmbed
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;

-const index_t low_length_;
const UpperIndex up_lengths_;
const Array<index_t, NDimUp + 1> coefficients_;

-__host__ __device__ explicit constexpr Embed(const index_t& low_length,
-const UpperIndex& up_lengths,
-const Array<index_t, NDimUp + 1>& coefficients)
-: low_length_(low_length), up_lengths_(up_lengths), coefficients_(coefficients)
-{
-static_assert(up_lengths.GetSize() == nDimUp && coefficients.GetSize() == nDimUp + 1,
-"wrong! # of dimensions not consistent");
-}
+__host__
+__device__ explicit constexpr DynamicEmbed(const UpperIndex& up_lengths,
+const Array<index_t, NDimUp + 1>& coefficients)
+: up_lengths_{up_lengths}, coefficients_{coefficients}
+{
+static_assert(UpperIndex::Size() == NDimUp, "wrong! # of dimensions not consistent");
+}

+__host__ __device__ explicit constexpr DynamicEmbed()
+: up_lengths_{make_zero_array<index_t, NDimUp>()},
+coefficients_{make_zero_array<index_t, NDimUp + 1>()}
+{
+}

__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }

__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

-__host__ __device__ static constexpr auto GetUpperIndex() { return up_lengths_; }
+__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-index_t idx_low = coefficients_[NDimUp];
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = coefficients_[NDimUp];

-for(index_t i = 0; i < nDimUp; ++i)
+#pragma unroll
+for(index_t i = 0; i < NDimUp; ++i)
{
-idx_low += idx_up[i] * coefficients_[i];
+idx_low(0) += idx_up[i] * coefficients_[i];
}
-
-return LowerIndex({idx_low});
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */) const
{
-index_t idx_low_diff = 0;
+static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
+LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
+"wrong! inconsistent # of dimension");
+
+idx_low_diff(0) = 0;

-for(index_t i = 0; i < nDimUp; ++i)
+#pragma unroll
+for(index_t i = 0; i < NDimUp; ++i)
{
-idx_low_diff += idx_up_diff[i] * Coefficients{}[i];
+idx_low_diff(0) += idx_up_diff[i] * coefficients_[i];
}
-
-return LowerIndex({idx_low_diff});
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -210,8 +257,9 @@ struct DynamicEmbed
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
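A quick standalone check of the Embed arithmetic with the convolution coefficients used earlier (plain C++; names illustrative):

using index_t = int;

// DynamicEmbed<2>{{Y, Ho}, {dilation_h, stride_h, 0}} computes
// idx_low = 0 + y * dilation_h + ho * stride_h,
// i.e. the padded input row read by filter tap y at output row ho.
index_t embed_lower(index_t y, index_t ho, index_t dilation_h, index_t stride_h)
{
    const index_t coefficients[3] = {dilation_h, stride_h, 0};
    index_t idx_low = coefficients[2];
    idx_low += y * coefficients[0];
    idx_low += ho * coefficients[1];
    return idx_low;
}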
@@ -227,27 +275,33 @@ struct DynamicMerge
const LowerIndex low_lengths_scan_;
const index_t up_length_;

-__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths_)
-: low_lengths_(low_lengths),
-low_lengths_scan_(reverse_inclusive_scan_on_array(low_lengths, multiplies<index_t>()),
-up_length(accumulate_on_array(low_lengths, multiplies<index_t>(), 1))
+__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
+: low_lengths_{low_lengths},
+low_lengths_scan_{reverse_inclusive_scan_on_array(
+low_lengths, math::multiplies<index_t>{}, index_t{1})},
+up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
{
+static_assert(LowerIndex::Size() == NDimLow, "wrong!");
}

+__host__ __device__ explicit constexpr DynamicMerge()
+: low_lengths_{make_zero_array<index_t, NDimLow>()},
+low_lengths_scan_{make_zero_array<index_t, NDimLow>()},
+up_length_{0}
+{
+}

-__host__ __device__ static constexpr index_t GetNumOfLowerDimension() {
-return NDimLow; }
-
-__host__ __device__ static constexpr index_t GetNumOfUpperDimension() {
-return 1; }
-
-__host__ __device__ constexpr auto GetUpperIndex() const
-{
-return UpperIndex({up_length_});
-}
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-LowerIndex idx_low;
+static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");

index_t itmp = idx_up[0];
@@ -259,29 +313,30 @@ struct DynamicMerge
}

idx_low(NDimLow - 1) = itmp;
+#pragma unroll
-
-return idx_low;
}

// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& idx_low_old)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& idx_low_old,
+const UpIdx& /* idx_up_old */) const
{
-LowerIndex idx_low_diff;
+static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");

-// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
+// CalculateLowerIndex(idx_low_diff_const) has multiple integer divisions.
// However,
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler-generated ISA should
-// only calculate idx_low_diff once and save it during the whole kernel execution
+// only calculate idx_low_diff_const once and save it during the whole kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
@@ -303,19 +358,19 @@ struct DynamicMerge
index_t idx_low_tmp[i] = idx_low_old[i] + carry;

-bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
+bool do_carry = idx_low_tmp[i] >= idx_low_length_minus_idx_low_diff_const;
#if 0
-bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
+bool do_borrow = idx_low_tmp[i] < -idx_low_diff_const[i];
#endif

-idx_low_diff[i] =
+idx_low_diff(i) =
do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const;
#if 0
-idx_low_diff[i] =
+idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
#endif

-idx_low_diff[i] += carry;
+idx_low_diff(i) += carry;

carry = do_carry ? 1 : 0;
#if 0
@@ -323,21 +378,19 @@ struct DynamicMerge
#endif
}

-idx_low_diff[0] = idx_low_diff_const[0] + carry;
-
-return idx_low_diff;
+idx_low_diff(0) = idx_low_diff_const[0] + carry;
}

-__host__ __device__ static constexpr bool IsLinearTransform() {
-return false; }
+__host__ __device__ static constexpr bool IsLinearTransform() { return false; }

__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
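The division path of Merge written out for NDimLow = 3 (plain C++ sketch; the carry logic above exists precisely so that a small fixed idx_up step does not re-run these divisions):

using index_t = int;

// Split a linear index over lengths {L0, L1, L2}; the scan strides
// {L1 * L2, L2} come from the reverse scan of products in the constructor.
void merge_calculate_lower(index_t idx_up, const index_t len[3], index_t idx_low[3])
{
    const index_t scan[2] = {len[1] * len[2], len[2]};

    index_t itmp = idx_up;
    for(index_t i = 0; i < 2; ++i)
    {
        idx_low[i] = itmp / scan[i];
        itmp -= idx_low[i] * scan[i];
    }
    idx_low[2] = itmp;
    // e.g. len {4, 5, 6}, idx_up 100 -> idx_low {3, 1, 4}, since 3*30 + 1*6 + 4 == 100
}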
@@ -353,56 +406,55 @@ struct DynamicUnMerge
const UpperIndex up_lengths_scan_;

__host__ __device__ explicit constexpr DynamicUnMerge(const UpperIndex& up_lengths)
-: up_lengths_(up_lengths),
-up_lengths_scan_(reverse_exclusive_scan_on_array(up_lengths, multiplies<index_t>(), index_t(1))
+: up_lengths_{up_lengths},
+up_lengths_scan_{
+reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>(), index_t{1})}
{
}

+__host__ __device__ explicit constexpr DynamicUnMerge()
+: up_lengths_{make_zero_array<index_t, NDimUp>()},
+up_lengths_scan_{make_zero_array<index_t, NDimUp>()}
+{
+}

-__host__ __device__ static constexpr index_t GetNumOfLowerDimension() {
-return 1; }
-
-__host__ __device__ static constexpr index_t GetNumOfUpperDimension() {
-return NDimUp; }
-
-__host__ __device__ constexpr auto GetUpperIndex() const {
-return up_lengths_; }
+__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
+
+__host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-index_t idx_low = idx_up[NDimUp];
+idx_low(0) = idx_up[NDimUp - 1];

#pragma unroll
for(index_t i = 0; i < NDimUp - 1; ++i)
{
-idx_low += idx_up[i] * up_lengths_scan_[i];
+idx_low(0) += idx_up[i] * up_lengths_scan_[i];
}
-
-return LowerIndex{idx_low};
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */) const
{
-return CalculateLowerIndex(idx_up_diff);
+CalculateLowerIndex(idx_low_diff, idx_up_diff);
}

-__host__ __device__ static constexpr bool IsLinearTransform() {
-return true; }
-
-__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
-{
-return true;
-}
+__host__ __device__ static constexpr bool IsLinearTransform() { return true; }

__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
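UnMerge is the inverse bookkeeping: it linearizes an upper index against strides produced by a reverse exclusive scan of the upper lengths. A standalone sketch (plain C++ with std::array; illustrative):

#include <array>
#include <cstddef>

using index_t = int;

// Reverse exclusive scan of {L0, L1, L2} with multiplies and init 1 gives
// {L1 * L2, L2, 1}: exactly the packed strides of a {L0, L1, L2} shape.
template <std::size_t N>
std::array<index_t, N> reverse_exclusive_scan_mul(const std::array<index_t, N>& len)
{
    std::array<index_t, N> scan{};
    index_t r = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        scan[i] = r; // exclusive: the running product before len[i] is folded in
        r *= len[i];
    }
    return scan;
}

// DynamicUnMerge::CalculateLowerIndex, spelled out
template <std::size_t N>
index_t unmerge_offset(const std::array<index_t, N>& idx_up, const std::array<index_t, N>& scan)
{
    index_t idx_low = 0;
    for(std::size_t i = 0; i < N; ++i)
        idx_low += idx_up[i] * scan[i];
    return idx_low;
}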
@@ -411,33 +463,38 @@ struct DynamicUnMerge
struct DynamicFreeze
{
using LowerIndex = MultiIndex<1>;
-using UpperIndex = MultiIndex<>;
+using UpperIndex = MultiIndex<0>;

const index_t low_idx_;
-const index_t low_length_;

-__host__ __device__ explicit constexpr Freeze(const index_t& low_idx, const index_t& low_length)
-: low_idx_(low_idx), low_length_(low_length)
+__host__ __device__ explicit constexpr DynamicFreeze(const index_t& low_idx) : low_idx_{low_idx}
{
}

+__host__ __device__ explicit constexpr DynamicFreeze() : low_idx_{0} {}

__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }

__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }

-__host__ __device__ static constexpr auto GetUpperIndex() { return UpperIndex(); }
+__host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }

-__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& /*idx_up*/) const
+template <typename LowIdx, typename UpIdx>
+__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
-return LowerIndex({low_length_});
+static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+"wrong! inconsistent # of dimension");
+
+idx_low(0) = low_idx_;
}

-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& /* idx_up_diff */,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
+template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+__host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
+const UpIdxDiff& idx_up_diff,
+const LowIdx& /* idx_low_old */,
+const UpIdx& /* idx_up_old */)
{
-return LowerIndex({0});
+idx_low_diff(0) = index_t{0};
}

__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -447,8 +504,9 @@ struct DynamicFreeze
return true;
}

+template <typename UpIdx>
__host__ __device__ static constexpr bool
-IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
...
@@ -6,8 +6,323 @@
namespace ck {

-struct TensorDescriptor
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
const Index lengths_;
const Index strides_;
__host__ __device__ explicit constexpr DynamicNativeTensorDescriptor(const Index& lengths,
const Index& strides)
: lengths_{lengths}, strides_{strides}
{
}
__host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr index_t GetLength(index_t idim) const { return lengths_[idim]; }
__host__ __device__ constexpr index_t GetStride(index_t idim) const { return strides_[idim]; }
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
index_t space = 1;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
space += (GetLength(i) - 1) * GetStride(i);
}
return space;
}
template <typename Idx>
__host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * GetStride(i);
}
return offset;
}
template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
__host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
return CalculateOffset(idx_up_diff);
}
template <typename Idx>
__host__ __device__ constexpr bool IsUpperIndexValid(const Idx& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < GetLength(i);
}
return flag;
}
};
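A small standalone check of the offset and element-space formulas above (plain C++; the concrete numbers are illustrative):

#include <cassert>

using index_t = int;

int main()
{
    // a 2 x 3 x 4 tensor with packed strides {12, 4, 1}
    const index_t lengths[3] = {2, 3, 4};
    const index_t strides[3] = {12, 4, 1};

    // CalculateOffset: offset = sum_i idx[i] * stride[i]
    const index_t idx[3] = {1, 2, 3};
    index_t offset = 0;
    for(index_t i = 0; i < 3; ++i)
        offset += idx[i] * strides[i];
    assert(offset == 23);

    // GetElementSpace: 1 + sum_i (length[i] - 1) * stride[i]
    index_t space = 1;
    for(index_t i = 0; i < 3; ++i)
        space += (lengths[i] - 1) * strides[i];
    assert(space == 24); // for packed strides this equals the element count

    return 0;
}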
template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or
// DynamicTransformedTensorDescriptor
typename Transforms, // Tuple<MultiIndexTransforms...>
typename LowDimensionIds, // Tuple<Sequence<...>>
typename UpDimensionIds> // Tuple<Sequence<...>>
struct DynamicTransformedTensorDescriptor
{
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
static constexpr index_t NTransform = Transforms::Size();
__host__ __device__ static constexpr index_t GetNumOfLowerDimension()
{
return LowTensorDescriptor::GetNumOfDimension();
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension()
{
index_t ndim_up = 0;
static_for<0, NTransform, 1>{}([&](auto i) constexpr {
constexpr auto tmp = UpDimensionIds{}.At(i);
ndim_up += decltype(tmp)::Size();
});
return ndim_up;
}
static constexpr index_t NDimUp = GetNumOfUpperDimension();
static constexpr index_t NDimLow = GetNumOfLowerDimension();
using UpperIndex = MultiIndex<NDimUp>;
using LowerIndex = MultiIndex<NDimLow>;
struct lambda_merge_sequences
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_sequences(xs...);
}
};
struct lambda_merge_arrays
{
template <typename... Xs>
__host__ __device__ constexpr auto operator()(Xs... xs) const
{
return merge_arrays(xs...);
}
};
__host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
: low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
{
static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
NTransform == UpDimensionIds::Size(),
"wrong! # of transformations not the same");
// sanity check:
// LowDimensionIds should include all low-dimensions,
// UpDimensionIds should include all up-dimensions
using unsorted_up_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, UpDimensionIds{}));
using sorted_up_dimension_ids =
typename sequence_sort<unsorted_up_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_up_dimension_ids::Size() == NDimUp &&
is_valid_sequence_map<sorted_up_dimension_ids>{},
"wrong! UpDimensionIds is not configured correctly");
using unsorted_low_dimension_ids =
decltype(unpack(lambda_merge_sequences{}, LowDimensionIds{}));
using sorted_low_dimension_ids =
typename sequence_sort<unsorted_low_dimension_ids, math::less<index_t>>::type;
static_assert(sorted_low_dimension_ids::Size() == NDimLow &&
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformations, a low-dimension should be associated with only one transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// of lower-tensor-descriptor
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
// sort upper-dimension-ids
constexpr auto unsorted_up_dimension_ids =
unpack(lambda_merge_sequences{}, UpDimensionIds{});
using sort_up_dimension_ids = sequence_unique_sort<decltype(unsorted_up_dimension_ids),
math::less<index_t>,
math::equal<index_t>>;
constexpr auto sorted2unsorted_map = typename sort_up_dimension_ids::sorted2unsorted_map{};
// sort upper-lengths
const auto tuple_of_up_lengths =
transform_tuples([](const auto& tran) constexpr { return tran.GetUpperLengths(); },
transforms_);
const auto unsorted_up_lengths = unpack(lambda_merge_arrays{}, tuple_of_up_lengths);
const auto sorted_up_lengths =
reorder_array_given_new2old(unsorted_up_lengths, sorted2unsorted_map);
return sorted_up_lengths;
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr index_t GetLength(index_t idim) const
{
return GetLengths()[idim];
}
__host__ __device__ constexpr index_t GetElementSize() const
{
return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
}
__host__ __device__ constexpr index_t GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndexDiff(
idx_low_diff_part, idx_up_diff_part, idx_low_old_part, idx_up_old_part);
});
}
template <typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndex(const UpIdx& idx_up) const
{
LowerIndex idx_low;
CalculateLowerIndex(idx_low, idx_up);
return idx_low;
}
template <typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpIdxDiff& idx_up_diff,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old) const
{
LowerIndex idx_low_diff;
CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return low_tensor_desc_.CalculateOffset(CalculateLowerIndex(idx_up));
}
__host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDimUp; ++i)
{
flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLength(i);
}
return flag;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
bool flag = true;
static_for<0, NTransform, 1>{}([&](auto itran) {
const auto tran = Transforms{}.At(itran);
// only check a transformation if it does not always have a valid mapping
constexpr bool is_valid_up_always_mapped_to_valid_low =
decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex();
if constexpr(!is_valid_up_always_mapped_to_valid_low)
{
const auto up_dims_part = UpDimensionIds{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, up_dims_part);

flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
}
});
return flag;
}
};

} // namespace ck
...
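How the per-transform dimension ids dispatch pieces of the index: a hand-written equivalent of one Merge plus one PassThrough layer over a 3D tensor (plain C++; not the CK template machinery):

#include <cassert>

using index_t = int;

int main()
{
    // lower tensor lengths {4, 5, 6}
    // transforms:  Merge{{4, 5}}   low dims {0, 1} -> up dim {0}
    //              PassThrough{6}  low dim  {2}    -> up dim {1}
    const index_t N1 = 5;

    const index_t idx_up[2] = {13, 2}; // upper shape is {20, 6}

    index_t idx_low[3];
    // Merge picks idx_up[0] (its UpDimensionIds) and writes idx_low[0..1]
    idx_low[0] = idx_up[0] / N1;
    idx_low[1] = idx_up[0] % N1;
    // PassThrough picks idx_up[1] and writes idx_low[2]
    idx_low[2] = idx_up[1];

    assert(idx_low[0] == 2 && idx_low[1] == 3 && idx_low[2] == 2);
    return 0;
}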
@@ -15,5 +15,21 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensionIds,
typename UpDimensionIds>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensionIds,
UpDimensionIds)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensionIds,
UpDimensionIds>{low_tensor_desc, transforms};
}
} // namespace ck

#endif
@@ -531,47 +531,5 @@ struct Freeze
}
};

-template <index_t LowerLength, index_t VectorSize>
-struct Vectorize
-{
-using LowerIndex = MultiIndex<1>;
-using UpperIndex = MultiIndex<1>;
-
-__host__ __device__ constexpr Vectorize()
-{
-static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
-"wrong! cannot evenly divide");
-}
-
-__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
-
-__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
-
-__host__ __device__ static constexpr auto GetUpperLengths()
-{
-return Sequence<LowerLength / VectorSize>{};
-}
-
-__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
-{
-return VectorSize * idx_up;
-}
-
-__host__ __device__ static constexpr auto
-CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-const UpperIndex& /* idx_up_old */,
-const LowerIndex& /* idx_low_old */)
-{
-return VectorSize * idx_up_diff;
-}
-
-__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
-
-__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
-{
-return true;
-}
-};

} // namespace ck

#endif
@@ -12,8 +12,9 @@ struct Array
using type = Array<TData, NSize>;
using data_type = TData;

+// hack: add extra element to allow empty array
// TODO: implement empty Array
-TData mData[NSize] = {0};
+TData mData[NSize + 1] = {0};

__host__ __device__ explicit constexpr Array() {}
@@ -136,16 +137,16 @@ struct ArrayElementPicker
return mArray(IP);
}

-template <typename I>
-__host__ __device__ constexpr const data_type& operator[](I i) const
+__host__ __device__ constexpr const data_type& operator[](index_t i) const
{
-return At(i);
+index_t ip = Picks{}[i];
+return mArray[ip];
}

-template <typename I>
-__host__ __device__ constexpr data_type& operator()(I i)
+__host__ __device__ constexpr data_type& operator()(index_t i)
{
-return At(i);
+index_t ip = Picks{}[i];
+return mArray(ip);
}

template <typename T>
...
@@ -244,7 +244,7 @@ __host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
-accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+reduce_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
@@ -288,10 +288,40 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]);
}

-y(i) = r;
+y(NSize - 1) = r;

return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto merge_arrays(const X& x, const Ys&... ys)
{
return merge_arrays(x, merge_arrays(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto merge_arrays(const Array<T, NX>& x, const Array<T, NY>& y)
{
Array<T, NX + NY> z;
for(index_t i = 0; i < NX; ++i)
{
z(i) = x[i];
}
for(index_t i = 0; i < NY; ++i)
{
z(i + NX) = y[i];
}
return z;
}
template <typename X>
__host__ __device__ constexpr auto merge_arrays(const X& x)
{
return x;
}
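For reference, a standalone equivalent of this recursive merge written against std::array (illustrative; outside the ck types):

#include <array>
#include <cstddef>

// two-array base case: concatenation
template <typename T, std::size_t NX, std::size_t NY>
std::array<T, NX + NY> merge(const std::array<T, NX>& x, const std::array<T, NY>& y)
{
    std::array<T, NX + NY> z{};
    for(std::size_t i = 0; i < NX; ++i)
        z[i] = x[i];
    for(std::size_t i = 0; i < NY; ++i)
        z[i + NX] = y[i];
    return z;
}

// single-array base case: identity
template <typename T, std::size_t N>
std::array<T, N> merge(const std::array<T, N>& x)
{
    return x;
}

// variadic case: peel one argument per recursion step
template <typename T, std::size_t NX, typename... Ys>
auto merge(const std::array<T, NX>& x, const Ys&... ys)
{
    return merge(x, merge(ys...));
}

// e.g. merge(std::array<int, 2>{{1, 2}}, std::array<int, 1>{{3}}) yields {1, 2, 3}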
} // namespace ck

#endif
@@ -8,9 +8,11 @@
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
+#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp"
+#include "array_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
...
@@ -104,56 +104,5 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
}
};

-template <typename... Xs>
-__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
-{
-return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
-}
-
-namespace detail {
-
-template <typename F, typename X, index_t... Is>
-__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}))...);
-}
-
-template <typename F, typename X, typename Y, index_t... Is>
-__host__ __device__ constexpr auto
-transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
-}
-
-template <typename F, typename X, typename Y, typename Z, index_t... Is>
-__host__ __device__ constexpr auto
-transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
-{
-return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
-}
-
-} // namespace detail
-
-template <typename F, typename X>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
-{
-return detail::transform_tuples_impl(
-f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}
-
-template <typename F, typename X, typename Y>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
-{
-return detail::transform_tuples_impl(
-f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}
-
-template <typename F, typename X, typename Y, typename Z>
-__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
-{
-return detail::transform_tuples_impl(
-f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
-}

} // namespace ck

#endif
#ifndef CK_TUPLE_HELPER_HPP
#define CK_TUPLE_HELPER_HPP
#include "tuple_helper.hpp"
namespace ck {
template <typename... Xs>
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
{
return Tuple<remove_cv_t<remove_reference_t<Xs>>...>(std::forward<Xs>(xs)...);
}
namespace detail {
template <typename F, typename X, index_t... Is>
__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}))...);
}
template <typename F, typename X, typename Y, typename Z, index_t... Is>
__host__ __device__ constexpr auto
transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence<Is...>)
{
return make_tuple(f(x.At(Number<Is>{}), y.At(Number<Is>{}), z.At(Number<Is>{}))...);
}
} // namespace detail
template <typename F, typename X>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x)
{
return detail::transform_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y)
{
return detail::transform_tuples_impl(
f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
template <typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z)
{
return detail::transform_tuples_impl(
f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
}
} // namespace ck
#endif
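Usage shape of transform_tuples, as exercised by GetUpperLengths() in the transformed descriptor above (illustrative):

// collect the upper lengths of every transform in a tuple:
//   const auto tuple_of_up_lengths = transform_tuples(
//       [](const auto& tran) { return tran.GetUpperLengths(); }, transforms_);
// unary, binary and ternary forms apply f element-wise across the tuples.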
@@ -17,5 +17,5 @@ cmake
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps" \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \