Commit 4388f572 authored by Chao Liu

prototyping dynamic tensor descriptor

parent b2098e70
@@ -12,16 +12,16 @@ namespace ck {
template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_v1(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
@@ -48,8 +48,8 @@ struct DummyDynamicTransform
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Y = 3;
const index_t X = 3;
@@ -234,7 +234,7 @@ struct DummyDynamicTransform
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
@@ -267,9 +267,9 @@ struct DummyDynamicTransform
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
@@ -302,8 +302,9 @@ struct DummyDynamicTransform
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
#if 0
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
@@ -317,6 +318,47 @@ struct DummyDynamicTransform
C,
Y,
X);
#elif 0
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = tmp / (Y * X);
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = tmp / X;
const index_t const_tmp_2 = tmp - const_tmp_1 * X;
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#elif 1
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = __llvm_amdgcn_readfirstlane_i32(tmp / (Y * X));
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = __llvm_amdgcn_readfirstlane_i32(tmp / X);
const index_t const_tmp_2 = __llvm_amdgcn_readfirstlane_i32(tmp - const_tmp_1 * X);
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#endif
// stage 2
// PassThrough(N) => N
@@ -372,7 +414,7 @@ struct DummyDynamicTransform
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
@@ -409,7 +451,39 @@ struct DummyDynamicTransform
}
}
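Note on the `#elif 1` branch in Run_v1 above: `__llvm_amdgcn_readfirstlane_i32` (a CK-internal wrapper, presumably over clang's `__builtin_amdgcn_readfirstlane`) broadcasts lane 0's value to the whole wavefront. When `idx_diff[15]` is wave-uniform, this lets the compiler keep the quotient/remainder of the merge decomposition in scalar registers instead of recomputing them per lane. A minimal sketch of the pattern, under those assumptions (`uniform_div` is illustrative, not part of the commit):

__device__ inline index_t uniform_div(index_t numerator, index_t denominator)
{
    // integer division is expensive on GPU; if the operands are wave-uniform,
    // do it once and broadcast the result into a scalar register
    return __llvm_amdgcn_readfirstlane_i32(numerator / denominator);
}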
__device__ void Run_v2(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
Index idx_up;
idx_up(0) = in_n_c_hi_wi_global_desc.GetLength(0);
idx_up(1) = in_n_c_hi_wi_global_desc.GetLength(1);
idx_up(2) = in_n_c_hi_wi_global_desc.GetLength(2);
idx_up(3) = in_n_c_hi_wi_global_desc.GetLength(3);
#if 0
constexpr auto trans = GetTransforms();
auto idx_low = trans[0]->CalculateLowerIndex(idx_up);
#elif 1
constexpr DynamicCoordinateTransform* tran = &embed;
auto idx_low = tran->CalculateLowerIndex(idx_up);
#endif
p_out_global[get_thread_local_1d_id()] = idx_low[0];
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
@@ -420,6 +494,16 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
Run_v2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
...
@@ -5,18 +5,452 @@
namespace ck {
struct DynamicPassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
__host__ __device__ explicit constexpr DynamicPassThrough(const index_t& low_length)
: low_length_(low_length)
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() { return UpperIndex({low_length_}); }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
template <bool SkipIsValidCheck = false>
struct DynamicLeftPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
const index_t left_pad_;
__host__ __device__ explicit constexpr DynamicLeftPad(const index_t& low_length, const index_t& left_pad)
: low_length_{low_length}, left_pad_{left_pad}
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex()
{
return UpperIndex({low_length_ + left_pad_});
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
return LowerIndex({idx_up[0] - left_pad_});
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] >= left_pad_);
}
};
template <bool SkipIsValidCheck = false>
struct DynamicRightPad
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
const index_t low_length_;
const index_t right_pad_;
__host__ __device__ explicit constexpr DynamicRightPad(const index_t& low_length, const index_t& right_pad)
: low_length_{low_length}, right_pad_{right_pad}
{
}
__host__ __device__ constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex()
{
return UpperIndex({low_length_ + right_pad_});
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return SkipIsValidCheck;
}
__host__ __device__ constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
return SkipIsValidCheck || (idx_up[0] < low_length_);
}
};
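For reference, the two pad transforms only shift and bounds-check a single index: LeftPad maps idx_up to idx_up - left_pad_ and marks indices inside the pad region invalid; RightPad keeps the index and invalidates anything at or past low_length_. A host-side sketch of that math (plain ints stand in for MultiIndex<1>; these are illustrations, not the CK types):

#include <cassert>

struct LeftPadRef
{
    int low_length, left_pad;
    int lower(int idx_up) const { return idx_up - left_pad; }
    bool valid(int idx_up) const { return idx_up >= left_pad; }
    int up_length() const { return low_length + left_pad; }
};

struct RightPadRef
{
    int low_length, right_pad;
    int lower(int idx_up) const { return idx_up; }
    bool valid(int idx_up) const { return idx_up < low_length; }
    int up_length() const { return low_length + right_pad; }
};

int main()
{
    LeftPadRef lp{8, 2}; // lower length 8, left pad 2 -> upper length 10
    assert(lp.up_length() == 10);
    assert(!lp.valid(1) && lp.valid(2));
    assert(lp.lower(2) == 0); // first valid upper index maps to lower 0
    RightPadRef rp{8, 2};
    assert(rp.valid(7) && !rp.valid(8)); // anything at or past low_length is pad
    return 0;
}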
// idx_low = coefficients[0, ..., NDimUp-1] * idx_up[0, ..., NDimUp-1] + coefficients[NDimUp]
template <index_t NDimUp>
struct DynamicEmbed
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const index_t low_length_;
const UpperIndex up_lengths_;
const Array<index_t, NDimUp + 1> coefficients_;
__host__ __device__ explicit constexpr DynamicEmbed(const index_t& low_length,
const UpperIndex& up_lengths,
const Array<index_t, NDimUp + 1>& coefficients)
: low_length_(low_length), up_lengths_(up_lengths), coefficients_(coefficients)
{
static_assert(up_lengths.GetSize() == NDimUp && coefficients.GetSize() == NDimUp + 1,
"wrong! # of dimensions not consistent");
}
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() const { return up_lengths_; }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
index_t idx_low = coefficients_[NDimUp];
for(index_t i = 0; i < NDimUp; ++i)
{
idx_low += idx_up[i] * coefficients_[i];
}
return LowerIndex({idx_low});
}
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */) const
{
index_t idx_low_diff = 0;
for(index_t i = 0; i < NDimUp; ++i)
{
idx_low_diff += idx_up_diff[i] * coefficients_[i];
}
return LowerIndex({idx_low_diff});
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
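The Embed formula above is a dot product with the coefficient vector plus a trailing constant term. A standalone host-side sketch (std::array stands in for the CK containers; names are illustrative):

#include <array>
#include <cassert>

// idx_low = sum_i coefficients[i] * idx_up[i] + coefficients[NDimUp]
template <int NDimUp>
int embed_lower_index(const std::array<int, NDimUp>& idx_up,
                      const std::array<int, NDimUp + 1>& coefficients)
{
    int idx_low = coefficients[NDimUp]; // constant offset term
    for(int i = 0; i < NDimUp; ++i)
        idx_low += idx_up[i] * coefficients[i];
    return idx_low;
}

int main()
{
    // a 3x4 row-major view with base offset 5: {row stride, col stride, offset}
    std::array<int, 3> coeff{4, 1, 5};
    assert(embed_lower_index<2>({0, 0}, coeff) == 5);
    assert(embed_lower_index<2>({2, 3}, coeff) == 2 * 4 + 3 * 1 + 5);
    return 0;
}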
template <index_t NDimLow>
struct DynamicMerge
{
using LowerIndex = MultiIndex<NDimLow>;
using UpperIndex = MultiIndex<1>;
const LowerIndex low_lengths_;
const LowerIndex low_lengths_scan_;
const index_t up_length_;
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_(low_lengths),
low_lengths_scan_(
reverse_exclusive_scan_on_array(low_lengths, math::multiplies<index_t>{}, index_t(1))),
up_length_(accumulate_on_array(low_lengths, math::multiplies<index_t>{}, index_t(1)))
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
__host__ __device__ constexpr auto GetUpperIndex() const
{
return UpperIndex({up_length_});
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
index_t itmp = idx_up[0];
#pragma unroll
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / low_lengths_scan_[i];
itmp -= idx_low[i] * low_lengths_scan_[i];
}
idx_low(NDimLow - 1) = itmp;
return idx_low;
}
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
// If idx_up_diff is known at compile time, many of these calculations can be
// optimized away by the compiler.
// This function assumes idx_low_old is not out-of-bound.
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old) const
{
LowerIndex idx_low_diff;
// CalculateLowerIndex(idx_up_diff) contains multiple integer divisions.
// 1) If idx_up_diff is known at compile time, then idx_low_diff_const
// can be calculated at compile time.
// 2) If idx_up_diff is not known at compile time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. The compiler-generated ISA should
// calculate idx_low_diff_const only once and keep it for the whole kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run time
// each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
#pragma unroll
for(index_t i = NDimLow - 1; i > 0; --i)
{
// this value should be saved (and reused) as well
const index_t idx_low_length_minus_idx_low_diff_const =
low_lengths_[i] - idx_low_diff_const[i];
#if 0
const index_t idx_low_length_plus_idx_low_diff_const =
low_lengths_[i] + idx_low_diff_const[i];
#endif
const index_t idx_low_tmp = idx_low_old[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
#if 0
bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
#endif
idx_low_diff(i) =
do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];
#if 0
idx_low_diff(i) =
do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
#endif
idx_low_diff(i) += carry;
carry = do_carry ? 1 : 0;
#if 0
carry = do_borrow ? -1 : carry;
#endif
}
idx_low_diff(0) = idx_low_diff_const[0] + carry;
return idx_low_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
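The carry update above is worth a worked example: the divisions run once on the step diff, and each subsequent move costs only compares and adds. A host-side reference for a fixed 3-d merge (std::array stands in for MultiIndex; this is an illustration, not the CK API):

#include <array>
#include <cassert>

// lengths = {C, Y, X}; diff_const is CalculateLowerIndex(idx_up_diff),
// computed once. Each step then needs no divisions.
std::array<int, 3> merge_lower_diff(const std::array<int, 3>& lengths,
                                    const std::array<int, 3>& idx_low_old,
                                    const std::array<int, 3>& diff_const)
{
    std::array<int, 3> diff{};
    int carry = 0;
    for(int i = 2; i > 0; --i)
    {
        const int head_room = lengths[i] - diff_const[i];
        const bool do_carry = idx_low_old[i] + carry >= head_room;
        diff[i] = (do_carry ? -head_room : diff_const[i]) + carry;
        carry = do_carry ? 1 : 0;
    }
    diff[0] = diff_const[0] + carry;
    return diff;
}

int main()
{
    // Merge(C=4, Y=3, X=3): moving by +1 in the merged dimension from
    // lower index (0, 2, 2) must wrap X and Y and carry into C.
    auto d = merge_lower_diff({4, 3, 3}, {0, 2, 2}, {0, 0, 1});
    assert(d[2] == -2 && d[1] == -2 && d[0] == 1); // (0,2,2) -> (1,0,0)
    return 0;
}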
template <index_t NDimUp>
struct DynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
__host__ __device__ explicit constexpr DynamicUnMerge(const UpperIndex& up_lengths)
: up_lengths_(up_lengths),
up_lengths_scan_(
reverse_exclusive_scan_on_array(up_lengths, math::multiplies<index_t>{}, index_t(1)))
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr auto GetUpperIndex() const { return up_lengths_; }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
index_t idx_low = idx_up[NDimUp - 1];
#pragma unroll
for(index_t i = 0; i < NDimUp - 1; ++i)
{
idx_low += idx_up[i] * up_lengths_scan_[i];
}
return LowerIndex{idx_low};
}
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */) const
{
return CalculateLowerIndex(idx_up_diff);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
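UnMerge is the linearization inverse of Merge: with the reverse-exclusive-scan strides it is plain row-major indexing. A standalone sketch for the 3-d case (illustrative names; not the CK API):

#include <array>
#include <cassert>

// with up lengths {A, B, C}, the scan strides are {B*C, C, 1} and
//   idx_low = idx_up[0]*B*C + idx_up[1]*C + idx_up[2]
int unmerge_lower_index(const std::array<int, 3>& idx_up,
                        const std::array<int, 3>& up_lengths)
{
    const std::array<int, 3> scan{up_lengths[1] * up_lengths[2], up_lengths[2], 1};
    int idx_low = idx_up[2]; // last dimension has stride 1
    for(int i = 0; i < 2; ++i)
        idx_low += idx_up[i] * scan[i];
    return idx_low;
}

int main()
{
    // lengths {4, 3, 3}: (1, 0, 0) linearizes to 9, (1, 2, 2) to 17
    assert(unmerge_lower_index({1, 0, 0}, {4, 3, 3}) == 9);
    assert(unmerge_lower_index({1, 2, 2}, {4, 3, 3}) == 1 * 9 + 2 * 3 + 2);
    return 0;
}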
struct DynamicFreeze
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<0>;
const index_t low_idx_;
const index_t low_length_;
__host__ __device__ explicit constexpr DynamicFreeze(const index_t& low_idx, const index_t& low_length)
: low_idx_(low_idx), low_length_(low_length)
{
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; }
__host__ __device__ static constexpr auto GetUpperIndex() { return UpperIndex(); }
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& /* idx_up */) const
{
return LowerIndex({low_idx_});
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& /* idx_up_diff */,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return LowerIndex({0});
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
}; };
...
@@ -2,236 +2,13 @@
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform.hpp"
namespace ck {
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
Array<index_t, NDim> lengths_;
Array<index_t, NDim> strides_;
index_t element_size_;
index_t element_space_;
template <typename Lengths, typename Strides>
__host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
const Strides& strides)
: lengths_(lengths), strides_(strides)
{
element_size_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_size_ *= lengths_[i];
}
element_space_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_space_ += (lengths_[i] - 1) * strides_[i];
}
}
__host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }
__host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
__host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }
__host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * strides_[i];
}
return offset;
}
__host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
{
index_t offset_diff = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset_diff += idx_diff[i] * strides_[i];
}
return offset_diff;
}
__host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
}
return flag;
}
};
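For reference, the offset and element-space arithmetic of the native descriptor above, as a standalone host-side sketch (std::array stands in for ck::Array; this is an illustration, not the CK class):

#include <array>
#include <cassert>

struct NativeDesc4
{
    std::array<int, 4> lengths, strides;

    int offset(const std::array<int, 4>& idx) const
    {
        int off = 0;
        for(int i = 0; i < 4; ++i)
            off += idx[i] * strides[i];
        return off;
    }

    int element_space() const
    {
        int space = 1; // offset of the last element, plus one
        for(int i = 0; i < 4; ++i)
            space += (lengths[i] - 1) * strides[i];
        return space;
    }
};

int main()
{
    // packed NCHW 2x3x4x5: strides {60, 20, 5, 1}
    NativeDesc4 d{{2, 3, 4, 5}, {60, 20, 5, 1}};
    assert(d.offset({1, 2, 3, 4}) == 60 + 40 + 15 + 4);
    assert(d.element_space() == 2 * 3 * 4 * 5); // packed case
    return 0;
}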
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
typename Transforms, // Tuple<DynamicMultIndexTransforms,...>
typename LowDimensions, // Tuple<Sequence<...>,...>
typename UpDimensions> // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
using Type = DynamicTransformedTensorDescriptor;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimension are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
static constexpr index_t num_transform_ = Transforms::Size();
using UpperIndex = MultiIndex<ndim_up_>;
using LowerIndex = MultiIndex<ndim_low_>;
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
const LowDimensions low_dims_;
const UpDimensions up_dims_;
__host__ __device__ constexpr DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_(low_tensor_desc),
transforms_(transforms)
{
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const
{
return low_tensor_desc_;
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr auto GetLength(index_t i) const
{
return GetLengths()[i];
}
__host__ __device__ constexpr auto GetElementSize() const
{
index_t element_size = 1;
for(index_t i = 0; i < ndim_up_; ++i)
{
element_size *= GetLength(i);
}
return element_size;
}
__host__ __device__ constexpr auto GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
static_for<0, num_transform_, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
{
LowerIndex idx_low_diff;
static_for<0, num_transform_, 1>{}([&](auto itran) {
constexpr auto tran = Transforms{}.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensions{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during constructor
// of TransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ static constexpr index_t CalculateOffset(const UpperIndex& idx_up)
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
};
#endif
} // namespace ck
#endif
@@ -15,6 +15,5 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
} // namespace ck
#endif
@@ -13,7 +13,7 @@ struct Array
using data_type = TData;
// TODO: implement empty Array
TData mData[NSize] = {0};
__host__ __device__ explicit constexpr Array() {}
@@ -159,262 +159,5 @@ struct ArrayElementPicker
Arr& mArray;
};
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{
return ArrayElementPicker<Arr, Picks>(a);
}
template <typename T>
__host__ __device__ constexpr auto to_array(const T& x)
{
Array<typename T::data_type, T::Size()> y;
static_for<0, T::Size(), 1>{}([&](auto i) { y.At(i) = x.At(i); });
return y;
}
// TODO: remove this
template <index_t... Is>
__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
{
return Array<index_t, sizeof...(Is)>{Is...};
}
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::type{};
constexpr auto zero_array = sequence2array(zero_sequence);
return zero_array;
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
}
template <typename TData, index_t NSize, typename ExtractSeq>
__host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
Array<TData, ExtractSeq::GetSize()> new_array;
constexpr index_t new_size = ExtractSeq::GetSize();
static_assert(new_size <= NSize, "wrong! too many extract");
static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::At(I)]; });
return new_array;
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Array + Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array += Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a + b;
return a;
}
// Array -= Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a - b;
return a;
}
// Array = Array + Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array * Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::multiplies<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = TData * Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
} // namespace ck
#endif
#ifndef CK_ARRAY_HELPER_HPP
#define CK_ARRAY_HELPER_HPP
#include "array.hpp"
namespace ck {
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_array_element(Arr& a, Picks)
{
return ArrayElementPicker<Arr, Picks>(a);
}
template <typename T>
__host__ __device__ constexpr auto to_array(const T& x)
{
Array<typename T::data_type, T::Size()> y;
static_for<0, T::Size(), 1>{}([&](auto i) { y.At(i) = x.At(i); });
return y;
}
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto make_zero_array()
{
constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::type{};
constexpr auto zero_array = to_array(zero_sequence);
return zero_array;
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array[IRs]...};
}
template <typename TData, index_t NSize, typename MapOld2New>
struct lambda_reorder_array_given_old2new
{
const Array<TData, NSize>& old_array;
Array<TData, NSize>& new_array;
__host__ __device__ constexpr lambda_reorder_array_given_old2new(
const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
: old_array(old_array_), new_array(new_array_)
{
}
template <index_t IOldDim>
__host__ __device__ constexpr void operator()(Number<IOldDim>) const
{
TData old_data = old_array[IOldDim];
constexpr index_t INewDim = MapOld2New::At(Number<IOldDim>{});
new_array(Number<INewDim>{}) = old_data;
}
};
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
Sequence<IRs...> /*old2new*/)
{
Array<TData, NSize> new_array;
static_assert(NSize == sizeof...(IRs), "NSize not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
static_for<0, NSize, 1>{}(
lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(old_array, new_array));
return new_array;
}
template <typename TData, index_t NSize, typename ExtractSeq>
__host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
{
Array<TData, ExtractSeq::GetSize()> new_array;
constexpr index_t new_size = ExtractSeq::GetSize();
static_assert(new_size <= NSize, "wrong! too many extract");
static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::At(I)]; });
return new_array;
}
// emulate constexpr lambda for array
template <typename F, typename X, typename Y, typename Z>
struct lambda_array_math
{
const F& f;
const X& x;
const Y& y;
Z& z;
__host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
: f(f_), x(x_), y(y_), z(z_)
{
}
template <index_t IDim_>
__host__ __device__ constexpr void operator()(Number<IDim_>) const
{
constexpr auto IDim = Number<IDim_>{};
z(IDim) = f(x[IDim], y[IDim]);
}
};
// Array = Array + Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
{
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array += Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a + b;
return a;
}
// Array -= Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
{
a = a - b;
return a;
}
// Array = Array + Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array - Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Array * Sequence
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::multiplies<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = Sequence - Array
template <typename TData, index_t NSize, index_t... Is>
__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
{
static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
Array<TData, NSize> result;
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
f, a, b, result));
return result;
}
// Array = TData * Array
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto operator*(TData v, Array<TData, NSize> a)
{
Array<TData, NSize> result;
for(index_t i = 0; i < NSize; ++i)
{
result(i) = a[i] * v;
}
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr TData
accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
{
TData result = init;
static_assert(NSize > 0, "wrong");
static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });
return result;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_inclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#pragma unroll
for(index_t i = NSize - 1; i >= 0; --i)
{
r = f(r, x[i]);
y(i) = r;
}
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
#pragma unroll
for(index_t i = NSize - 1; i > 0; --i)
{
y(i) = r;
r = f(r, x[i]);
}
y(0) = r;
return y;
}
} // namespace ck
#endif
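The difference between the two reverse scans above matters for the transforms: on lengths {C, Y, X} the exclusive variant with init 1 produces the stride vector {Y*X, X, 1} used by Merge/UnMerge, while the inclusive variant folds in each element's own length. A standalone check (std::array stands in for ck::Array; this is an illustration, not the CK code):

#include <array>
#include <cassert>
#include <functional>

template <std::size_t N, typename F>
std::array<int, N> reverse_inclusive_scan(std::array<int, N> x, F f, int init)
{
    std::array<int, N> y{};
    int r = init;
    for(int i = static_cast<int>(N) - 1; i >= 0; --i)
    {
        r = f(r, x[i]); // y[i] includes x[i]
        y[i] = r;
    }
    return y;
}

template <std::size_t N, typename F>
std::array<int, N> reverse_exclusive_scan(std::array<int, N> x, F f, int init)
{
    std::array<int, N> y{};
    int r = init;
    for(int i = static_cast<int>(N) - 1; i > 0; --i)
    {
        y[i] = r; // y[i] excludes x[i]
        r = f(r, x[i]);
    }
    y[0] = r;
    return y;
}

int main()
{
    std::array<int, 3> lengths{4, 3, 3}; // {C, Y, X}
    auto inc = reverse_inclusive_scan(lengths, std::multiplies<int>{}, 1);
    auto exc = reverse_exclusive_scan(lengths, std::multiplies<int>{}, 1);
    assert(inc == (std::array<int, 3>{36, 9, 3})); // {C*Y*X, Y*X, X}
    assert(exc == (std::array<int, 3>{9, 3, 1}));  // {Y*X, X, 1}: strides
    return 0;
}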
@@ -197,7 +197,7 @@ int main(int argc, char* argv[])
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<2, 2>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
@@ -244,7 +244,7 @@ int main(int argc, char* argv[])
#endif
}
#if 0
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
@@ -253,17 +253,17 @@ int main(int argc, char* argv[])
#elif 1
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
#endif
(in_nchw_desc,
in_nchw_device,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
if(do_verification)
{
...