Unverified Commit 38a90b6e authored by Chao Liu, committed by GitHub

Merge pull request #43 from ROCmSoftwarePlatform/develop

Merge develop into master
parents 88833bd9 c3018794
@@ -1008,20 +1008,27 @@ struct inner_product_with_conversion
 };

 template <typename T>
-struct NumericLimits;
+struct NumericLimits
+{
+    __host__ __device__ static constexpr T Min() { return std::numeric_limits<T>::min(); }
+
+    __host__ __device__ static constexpr T Max() { return std::numeric_limits<T>::max(); }
+
+    __host__ __device__ static constexpr T Lowest() { return std::numeric_limits<T>::lowest(); }
+};

 template <>
-struct NumericLimits<int32_t>
+struct NumericLimits<half_t>
 {
-    __host__ __device__ static constexpr int32_t Min()
-    {
-        return std::numeric_limits<int32_t>::min();
-    }
-
-    __host__ __device__ static constexpr int32_t Max()
-    {
-        return std::numeric_limits<int32_t>::max();
-    }
+    static constexpr unsigned short binary_min = 0x0400;
+    static constexpr unsigned short binary_max = 0x7BFF;
+    static constexpr unsigned short binary_lowest = 0xFBFF;
+
+    __host__ __device__ static constexpr half_t Min() { return as_type<half_t>(binary_min); }
+
+    __host__ __device__ static constexpr half_t Max() { return as_type<half_t>(binary_max); }
+
+    __host__ __device__ static constexpr half_t Lowest() { return as_type<half_t>(binary_lowest); }
 };

 } // namespace ck
...
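For reference, the new half_t constants are the IEEE 754 binary16 bit patterns for the smallest positive normal value (0x0400 ≈ 6.1035e-05), the largest finite value (0x7BFF = 65504) and its negative (0xFBFF = -65504); as_type presumably reinterprets the bits as a half_t. A standalone host-side sketch (not part of the change) that checks those decoded values:

#include <cassert>
#include <cmath>

int main()
{
    // binary16 normal numbers: value = (1 + mantissa/1024) * 2^(stored_exponent - 15)
    const double min_normal = std::ldexp(1.0, -14);                               // 0x0400
    const double max_finite = (2.0 - std::ldexp(1.0, -10)) * std::ldexp(1.0, 15); // 0x7BFF
    assert(min_normal == 6.103515625e-05);
    assert(max_finite == 65504.0);
    // 0xFBFF only adds the sign bit to 0x7BFF, i.e. -65504.
    return 0;
}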
@@ -38,6 +38,10 @@ struct DynamicBuffer
         return BufferAddressSpace;
     }

+    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+
+    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+
     template <typename X,
               typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                          typename scalar_type<remove_cvref_t<T>>::type>::value,
...
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_COMMON_HPP
#define CK_REDUCTION_COMMON_HPP
#include "reduction_enums.hpp"
namespace ck {
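// Note: "x <= c and x >= c" is equivalent to "x == c"; the two-sided form is presumably used to
// avoid compiler warnings about exact floating-point equality comparison.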
struct float_equal_one
{
template <class T>
__device__ inline bool operator()(T x)
{
return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
};
};
struct float_equal_zero
{
template <class T>
__device__ inline bool operator()(T x)
{
return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
};
};
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_ENUMS_HPP
#define CK_REDUCTION_ENUMS_HPP
namespace ck {
enum class ReduceTensorOp_t
{
ADD = 0,
MUL = 1,
MIN = 2,
MAX = 3,
AMAX = 4,
AVG = 5,
NORM1 = 6,
NORM2 = 7,
// MUL_NO_ZEROS = 8,
};
enum class NanPropagation_t
{
NOT_PROPAGATE_NAN = 0,
PROPAGATE_NAN = 1,
};
enum class ReduceTensorIndices_t
{
NO_INDICES = 0,
FLATTENED_INDICES = 1,
};
enum class IndicesType_t
{
INDICES_32BIT = 0,
INDICES_64BIT = 1,
INDICES_16BIT = 2,
INDICES_8BIT = 3,
};
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BINOP_HPP
#define CK_REDUCTION_FUNCTIONS_BINOP_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
namespace ck {
namespace detail {
static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
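// binop_with_nan_check wraps the reduction functor opReduce: the NOT_PROPAGATE_NAN
// specialization simply forwards to opReduce, while the PROPAGATE_NAN specialization makes any
// NaN input immediately become the accumulated value (and, for indexable operators, its index).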
template <NanPropagation_t nanPropaOpt, typename opReduce, typename compType>
struct binop_with_nan_check;
template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::NOT_PROPAGATE_NAN, opReduce, compType>
{
// cppcheck-suppress constParameter
__device__ static inline void calculate(compType& accuVal, compType currVal)
{
opReduce{}(accuVal, currVal);
};
// This method is called when opReduce is indexable and the user asked for indices
__device__ static inline void
// cppcheck-suppress constParameter
calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
{
bool changed = false;
opReduce{}(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
};
};
template <typename opReduce, typename compType>
struct binop_with_nan_check<NanPropagation_t::PROPAGATE_NAN, opReduce, compType>
{
__device__ static inline void calculate(compType& accuVal, compType currVal)
{
if(isnan(currVal))
accuVal = currVal;
else
opReduce{}(accuVal, currVal);
};
// This method is called when opReduce is indexable and the user asked for indices
__device__ static inline void
calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
{
if(isnan(currVal))
{
accuVal = currVal;
accuIndex = currIndex;
}
else
{
bool changed = false;
opReduce{}(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
}
};
};
}; // namespace detail
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_OPERATOR_HPP
#define CK_REDUCTION_OPERATOR_HPP
#include "reduction_common.hpp"
namespace ck {
namespace reduce {
// Every binary operator used in reduction is represented by a templated functor class. Each
// functor class must provide at least three members:
// 1) GetReductionZeroVal() -- returns the "identity element" of the binary operator, i.e. the
//    unique element of the algebraic space that leaves every other element unchanged when
//    operated against it (similar in spirit to the zero vector of a vector space,
//    http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf).
// 2) indexable -- boolean value indicating whether the indices of the operated elements can be
//    recorded. Min/Max operators usually need to record the indices of elements; for operators
//    like Add/Mul there is no need to record indices.
// 3) operator() -- the first argument is both an input and an output, and usually stores the
//    result accumulated over many operator() calls; the second argument is an input only. For
//    an indexable binary operator, a second overload of operator() takes a third (output)
//    argument indicating whether the accumulated value (the first argument) has changed, in
//    which case the recorded accumulated index also needs to be updated.
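// Illustration only (hypothetical helper, not used elsewhere in this header): a serial
// accumulation written against the three-member contract described above. For
// opReduce = Add<float> it returns x0 + x1 + ...; for an indexable functor the
// three-argument operator() would additionally report when the accumulator changed so the
// caller can track the winning index.
template <typename opReduce, typename T, int N>
__device__ constexpr T example_serial_reduce(const T (&values)[N])
{
    T accuVal = opReduce::GetReductionZeroVal(); // start from the identity element
    for(int i = 0; i < N; ++i)
        opReduce{}(accuVal, values[i]); // the first argument accumulates in place
    return accuVal;
}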
template <class T>
struct Add
{
using dataType = T;
__device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
__device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
static constexpr bool indexable = false;
};
template <class T>
struct Mul
{
using dataType = T;
__device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
__device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
static constexpr bool indexable = false;
};
template <class T>
struct Max
{
using dataType = T;
__device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Lowest(); };
__device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
__device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
a = b;
changed = true;
}
}
static constexpr bool indexable = true;
};
template <class T>
struct Min
{
using dataType = T;
__device__ static constexpr T GetReductionZeroVal() { return NumericLimits<T>::Max(); };
__device__ inline constexpr void operator()(T& a, T b) const
{
if(a > b)
a = b;
}
__device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a > b)
{
a = b;
changed = true;
}
}
static constexpr bool indexable = true;
};
template <class T>
struct AMax
{
using dataType = T;
__device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
__device__ inline constexpr void operator()(T& a, T b) const
{
if(a < b)
a = b;
}
__device__ inline constexpr void operator()(T& a, T b, bool& changed) const
{
if(a < b)
{
a = b;
changed = true;
}
}
static constexpr bool indexable = true;
};
// Unary operators are usually applied element-wise before the reduction is executed on the
// elements. They make it easy to implement the AVG, NORM1 and NORM2 reductions
// (an illustrative composition sketch follows the unary functor definitions below).
template <class T, bool hasDividing>
struct unary_identic
{
__device__ unary_identic(const int divider = 1)
{
scaler = 1.0f / static_cast<float>(divider);
};
__device__ inline constexpr T operator()(T a) const { return a * type_convert<T>{}(scaler); };
float scaler = 1.0f;
};
template <class T>
struct unary_identic<T, false>
{
__device__ unary_identic(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a; };
};
template <class T, bool hasDividing>
struct unary_square
{
__device__ unary_square(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = a * a;
return a * type_convert<T>{}(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_square<T, false>
{
__device__ unary_square(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return a * a; };
};
template <class T, bool hasDividing>
struct unary_abs
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline constexpr T operator()(T a) const
{
a = abs(a);
return a * type_convert<T>{}(scaler);
};
float scaler = 1.0f;
};
template <class T>
struct unary_abs<T, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline constexpr T operator()(T a) const { return abs(a); };
};
// We know for sure that HIP 4.0 provides __habs(), while HIP 3.0 does not.
// Assume that __habs() exists since HIP 3.5.
#if HIP_PACKAGE_VERSION_FLAT < 3005000000
inline __device__ __half __habs(__half x)
{
union
{
__half half;
unsigned short u16;
} val;
val.half = x;
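// clear the sign bit (bit 15), leaving |x|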
val.u16 = val.u16 & 0x7fff;
return val.half;
}
#endif
template <bool hasDividing>
struct unary_abs<half_t, hasDividing>
{
__device__ unary_abs(const int divider = 1) { scaler = 1.0f / static_cast<float>(divider); };
__device__ inline half_t operator()(half_t a) const
{
a = static_cast<half_t>(__habs(a));
return a * type_convert<half_t>{}(scaler);
};
float scaler = 1.0f;
};
template <>
struct unary_abs<half_t, false>
{
__device__ unary_abs(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(__habs(a)); };
};
template <class T>
struct unary_sqrt
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline T operator()(T a) const { return sqrtf(a); };
};
template <>
struct unary_sqrt<half_t>
{
__device__ unary_sqrt(const int divider = 1) { (void)divider; };
__device__ inline half_t operator()(half_t a) const { return static_cast<half_t>(hsqrt(a)); };
};
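// Illustration only (hypothetical helper, not referenced by the library): NORM2 of a small
// array, composed the same way the kernels compose it -- square every element (pre-unary
// functor), Add-reduce, then take the square root of the accumulated sum (post-unary functor).
template <int N>
__device__ inline float example_norm2(const float (&values)[N])
{
    const unary_square<float, false> pre_op;
    const unary_sqrt<float> post_op;

    float acc = Add<float>::GetReductionZeroVal(); // identity element: 0.0f
    for(int i = 0; i < N; ++i)
        Add<float>{}(acc, pre_op(values[i])); // accumulate the squared elements
    return post_op(acc);                      // sqrt of the sum of squares
}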
}; // end of namespace reduce
// The templated struct reduce_binary_operator maps the enum ids of the binary operators to
// their respective functor classes. The "GetReductionZeroVal()" interface and the boolean
// member "indexable" are also exposed by reduce_binary_operator so that the upper-layer kernel
// code can check them easily (a couple of illustrative compile-time checks follow the
// specializations below).
template <typename T, ReduceTensorOp_t op>
struct reduce_binary_operator;
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::ADD>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MUL>
{
using opType = reduce::Mul<T>;
using dataType = T;
static constexpr bool indexable = reduce::Mul<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MIN>
{
using opType = reduce::Min<T>;
using dataType = T;
static constexpr bool indexable = reduce::Min<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::MAX>
{
using opType = reduce::Max<T>;
using dataType = T;
static constexpr bool indexable = reduce::Max<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AMAX>
{
using opType = reduce::AMax<T>;
using dataType = T;
static constexpr bool indexable = reduce::AMax<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::AVG>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM1>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
template <typename T>
struct reduce_binary_operator<T, ReduceTensorOp_t::NORM2>
{
using opType = reduce::Add<T>;
using dataType = T;
static constexpr bool indexable = reduce::Add<T>::indexable;
};
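// Illustration only: a couple of compile-time checks of the mapping above; they are examples
// and are not required by the library.
static_assert(!reduce_binary_operator<float, ReduceTensorOp_t::ADD>::indexable,
              "ADD does not record element indices");
static_assert(reduce_binary_operator<float, ReduceTensorOp_t::MAX>::indexable,
              "MAX can record element indices");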
// The templated struct reduce_unary_operator maps the enum ids of the Reduce operators to two
// unary functor classes. The two unary functors are applied before and after the reduction is
// executed, respectively (an illustrative instantiation follows the specializations below).
template <typename T, ReduceTensorOp_t op, bool isFirstReduce, bool isLastReduce>
struct reduce_unary_operator
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isFirstReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AVG, isFirstReduce, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_identic<T, true>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM1, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T, bool isLastReduce>
struct reduce_unary_operator<T, ReduceTensorOp_t::AMAX, true, isLastReduce>
{
using preUnaryOp = reduce::unary_abs<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, false>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_identic<T, false>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, true, true>
{
using preUnaryOp = reduce::unary_square<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
template <typename T>
struct reduce_unary_operator<T, ReduceTensorOp_t::NORM2, false, true>
{
using preUnaryOp = reduce::unary_identic<T, false>;
using posUnaryOp = reduce::unary_sqrt<T>;
};
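// Illustration only: for the final pass of an AVG reduction the pre-unary functor passes the
// values through and the post-unary functor multiplies by 1/divider (the original reduce
// length). The aliases below are hypothetical and unused by the library.
using example_avg_ops  = reduce_unary_operator<float, ReduceTensorOp_t::AVG, true, true>;
using example_avg_pre  = example_avg_ops::preUnaryOp; // reduce::unary_identic<float, false>
using example_avg_post = example_avg_ops::posUnaryOp; // reduce::unary_identic<float, true>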
} // end of namespace ck
#endif
@@ -55,6 +55,98 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
    __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
};
template <AddressSpaceEnum_t BufferAddressSpace,
typename T,
index_t N,
bool InvalidElementUseNumericalZeroValue>
struct StaticBufferV2 : public StaticallyIndexedArray<T, N>
{
using type = T;
using base = StaticallyIndexedArray<T, N>;
using VecBaseType = typename T::d1_t;
__host__ __device__ static constexpr index_t GetVectorSize()
{
return sizeof(typename T::type) / sizeof(VecBaseType);
}
static constexpr index_t vector_size = GetVectorSize();
VecBaseType invalid_element_value_ = VecBaseType{0};
T invalid_vec_value_ = T{0};
__host__ __device__ constexpr StaticBufferV2() : base{} {}
__host__ __device__ constexpr StaticBufferV2(VecBaseType invalid_element_value)
    : base{},
      invalid_element_value_{invalid_element_value},
      invalid_vec_value_{invalid_element_value}
{
}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return BufferAddressSpace;
}
template <index_t I>
__host__ __device__ constexpr auto& GetVector(Number<I> vec_id)
{
return this->At(vec_id);
}
template <index_t I>
__host__ __device__ constexpr const auto& GetVector(Number<I> vec_id) const
{
return this->At(vec_id);
}
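// Scalar access below: element i is stored in vector i / vector_size, at lane i % vector_size.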
template <index_t I>
__host__ __device__ constexpr auto& GetElement(Number<I> i, bool)
{
constexpr auto vec_id = Number<i / vector_size>{};
constexpr auto vec_off = Number<i % vector_size>{};
return this->At(vec_id).template AsType<VecBaseType>()(vec_off);
}
template <index_t I>
__host__ __device__ constexpr auto GetElement(Number<I> i, bool is_valid_element) const
{
constexpr auto vec_id = Number<i / vector_size>{};
constexpr auto vec_off = Number<i % vector_size>{};
if constexpr(InvalidElementUseNumericalZeroValue)
{
return is_valid_element ? this->At(vec_id).template AsType<VecBaseType>()[vec_off]
: VecBaseType{0};
}
else
{
return is_valid_element ? this->At(vec_id).template AsType<VecBaseType>()[vec_off]
: invalid_element_value_;
}
}
template <index_t I>
__host__ __device__ constexpr auto operator[](Number<I> i) const
{
return GetElement(i, true);
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I> i)
{
return GetElement(i, true);
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
__host__ __device__ static constexpr bool IsDynamicBuffer() { return false; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
...
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
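// e.g. for lengths = {2, 3, 4, 1, 1, 1}, make_tuple_from_array(lengths, Number<3>{}) yields
// make_tuple(2, 3, 4)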
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
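// Workspace layout: the transformed 2-D source descriptor is written at byte offset 0 of
// ws_global and the 1-D destination descriptor at byte offset 2048; only thread 0 stores
// them, and the reduction kernel below reads them back through matching reference types.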
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
// accurate strides are not needed here to obtain the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one non-reduced (invariant) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// accurate strides are not needed here to obtain the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
// accurate strides are not needed here to obtain the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
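// MultiBlock workspace layout: descriptors at byte offsets 0 and 2048, the partial-result
// buffer (ws_buf1) at byte offset 4096, and the optional indices buffer (ws_buf2) at
// ws_buf1 + ws_buf2_bytes_offset.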
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one non-reduced (invariant) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
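    // Workspace layout written by this prepare kernel: the transformed 2D source descriptor
    // is stored at byte offset 0 of ws_global, the 1D destination descriptor at byte offset
    // 2048. Only thread 0 of the block performs the stores.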
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
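    // Each of the BlkGroupSize block-groups reduces a contiguous chunk of the reduce
    // dimension; the chunk is rounded up to a multiple of copySliceLen, and the source is
    // padded below so that the last block-group also works on a full chunk.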
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
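// The descriptors stored in the workspace by the prepare kernel are read back in the main
// kernel through reinterpret_cast, so their static types must be reconstructed here. The
// "reference" descriptors below use dummy lengths (8) purely to recover those types via
// decltype; the actual length and stride values are irrelevant.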
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
    // don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
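    // Workspace layout consumed here: the 2D source descriptor at byte offset 0, the 1D
    // destination descriptor at byte offset 2048, and the partial-result buffer
    // (ws_buf1_global) starting at byte offset 4096. When index tracking is enabled,
    // ws_buf2_bytes_offset locates the index buffer (ws_buf2_global) past the value buffer.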
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
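    // RunId 2 runs the multiblock reduction with index tracking (partial values to
    // ws_buf1_global, their indices to ws_buf2_global for the second stage); RunId 1 is the
    // value-only variant.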
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = GredThreadBufferLength;
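    // Padding: the length-1 invariant dimension is padded up to GridSize * BlockSize (one
    // row per launched thread) and the reduce dimension up to a multiple of copySliceLen.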
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredThreadBufferLength>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one invariant (non-reduced) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = GredThreadBufferLength;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dst1dDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
}
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
    // don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredThreadBufferLength>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce =
GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInWarp>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one invariant (non-reduced) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dst1dDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
}
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
    // don't have to use accurate strides to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce =
GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInWarp>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
(void)GridSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const index_t invariantLen = dstDesc.GetLength(Number<0>{});
const index_t toReduceLen = BlkGroupSize;
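    // Second-stage 2D view of the workspace: one row per output element, BlkGroupSize
    // partial values per row (one produced by each block-group of the first stage).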
auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
struct get_ref_desc_types
{
static constexpr auto ref_tupleDstLengths = make_tuple(8);
static constexpr auto ref_dstDesc =
make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
static constexpr index_t ref_toReduceLen = 8;
static constexpr auto ref_src2dDesc =
make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
};
using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_34 = typename get_ref_desc_types::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
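// The descriptors stored by the prepare kernel are recovered by reinterpreting the raw
// workspace bytes as the compile-time reference types above. This relies on the runtime
// descriptors being built from exactly the same sequence of transforms as ref_src2dDesc /
// ref_dst1dDesc (only the lengths differ), so the stored and reinterpreted object layouts
// match.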
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_src_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
false,
true,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 3 : 1;
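// Run<3> is the index-producing variant (ws_buf2 supplies the indices generated by the earlier
// pass, indices_global receives the final ones); Run<1> is the plain reduction.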
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(ws_buf2_global),
static_cast<int* const __restrict__>(indices_global));
};
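// Illustrative host-side sketch (not taken from this repository): one way the two entry points
// above could be driven with the plain HIP runtime API. Everything here is an assumption made
// for illustration -- the launch configuration and the alpha/beta values are placeholders, and
// the real host code drives these JIT-compiled kernels through its own invocation machinery.
//
//     #include <hip/hip_runtime.h>
//
//     void launch_second_stage(void* ws, // workspace already holding the descriptors' space
//                                        // and the stage-one partial results from byte 4096 on
//                              void* dst,
//                              void* indices,
//                              int gridSize,
//                              int blkGroupSize,
//                              int origReduceLen,
//                              long ws_buf2_bytes_offset,
//                              hipStream_t stream)
//     {
//         // Build the src2d/dst1d descriptors on the device and store them in the workspace.
//         hipLaunchKernelGGL(gridwise_generic_reduce_2_prepare,
//                            dim3(1), dim3(BlockSize), 0, stream,
//                            gridSize, blkGroupSize, ws);
//
//         // Run the block-wise reduction over the workspace contents; p_src_global is unused
//         // by this second stage, so a null pointer is passed for it.
//         hipLaunchKernelGGL(gridwise_generic_reduce_2,
//                            dim3(gridSize), dim3(BlockSize), 0, stream,
//                            origReduceLen, 1.0f, /*p_src_global=*/nullptr, 0.0f, dst,
//                            ws, ws_buf2_bytes_offset, indices);
//     }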
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
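// Example: with dstDims == 4 and lengths == {N, C, H, W, 1, 1}, make_tuple_from_array(lengths,
// Number<4>{}) returns make_tuple(N, C, H, W); only the first dstDims entries of the array are
// read.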
extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
int BlkGroupSize,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
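// The merge transform flattens the dstDims-dimensional output into a single dimension, so
// invariantLen below is the product of all output lengths.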
const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
const index_t toReduceLen = BlkGroupSize;
auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t dstDims>
struct get_ref_desc_types
{
static constexpr auto ref_tupleDstLengths =
make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
static constexpr auto ref_dstDesc =
make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(ref_tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
static constexpr index_t ref_toReduceLen = 8;
static constexpr auto ref_src2dDesc =
make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
// used by the BlockWise and MultiBlock method
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
};
using refType_src2dDesc = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
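// Unlike the rank-1 variant above, get_ref_desc_types is templated on dstDims here so that the
// reference descriptor types include the same merge transform applied by the prepare kernel,
// keeping the stored and reinterpreted object layouts identical.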
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_src_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
false,
true,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 3 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(ws_buf2_global),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
using toReduceDims = Sequence<CK_PARAM_TOREDUCE_DIMS>;
using invariantDims = Sequence<CK_PARAM_INVARIANT_DIMS>; // this could be empty
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
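// Direct threadwise variant: the invariant (output) dimension is padded up to the total thread
// count GridSize * BlockSize, i.e. one output element per thread, and the to-reduce dimension is
// padded to a multiple of GredThreadBufferLength.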
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const index_t invariantLen = dstDesc.GetLength(Number<0>{});
const index_t toReduceLen = BlkGroupSize;
auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
constexpr auto copySliceLen = GredThreadBufferLength;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
struct get_ref_desc_types
{
static constexpr auto ref_tupleDstLengths = make_tuple(8);
static constexpr auto ref_dstDesc =
make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
static constexpr index_t ref_toReduceLen = 8;
static constexpr auto ref_src2dDesc =
make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
};
using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_src_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
false,
true,
GredThreadBufferLength>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 3 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(ws_buf2_global),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
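// Same direct threadwise method as the previous translation unit, but for a dstDims-dimensional
// output tensor that is first flattened with a merge transform in the prepare kernel below.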
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_2_prepare(int GridSize,
int BlkGroupSize,
int outLength0,
int outLength1,
int outLength2,
int outLength3,
int outLength4,
int outLength5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int dstLengths[6] = {
outLength0, outLength1, outLength2, outLength3, outLength4, outLength5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleDstLengths = make_tuple_from_array(dstLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const index_t invariantLen = dst1dDesc.GetLength(Number<0>{});
const index_t toReduceLen = BlkGroupSize;
auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
constexpr auto copySliceLen = GredThreadBufferLength;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dst1dDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
}
};
template <index_t dstDims>
struct get_ref_desc_types
{
static constexpr auto ref_tupleDstLengths =
make_tuple_from_seq(typename uniform_sequence_gen<dstDims, 8>::type{});
static constexpr auto ref_dstDesc =
make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(ref_tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr index_t ref_invariantLen = ref_dst1dDesc.GetLength(Number<0>{});
static constexpr index_t ref_toReduceLen = 8;
static constexpr auto ref_src2dDesc =
make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
};
using refType_src2dDesc = typename get_ref_desc_types<dstDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<dstDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<dstDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<dstDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_src_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
false,
true,
GredThreadBufferLength>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 3 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(ws_buf2_global),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_warpwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInWarp = CK_PARAM_ACCESSES_PER_THREAD_INWARP; // tunable
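// Direct warpwise variant: the invariant (output) dimension is padded up to the warp count
// GridSize * BlockSize / warpSize, i.e. one warp per output element, and the to-reduce dimension
// is padded to a multiple of warpSize * GredAccessesPerThreadInWarp.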
extern "C" __global__ void
gridwise_generic_reduce_2_prepare(int GridSize, int BlkGroupSize, void* __restrict__ ws_global)
{
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const index_t invariantLen = dstDesc.GetLength(Number<0>{});
const index_t toReduceLen = BlkGroupSize;
auto src2dDesc = make_naive_tensor_descriptor_packed(make_tuple(invariantLen, toReduceLen));
constexpr auto copySliceLen = warpSize * GredAccessesPerThreadInWarp;
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize / warpSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize / warpSize - invariantLen;
auto dst1dDesc_2 =
transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
struct get_ref_desc_types
{
static constexpr auto ref_tupleDstLengths = make_tuple(8);
static constexpr auto ref_dstDesc =
make_naive_tensor_descriptor(ref_tupleDstLengths, ref_tupleDstLengths);
static constexpr index_t ref_invariantLen = ref_dstDesc.GetLength(Number<0>{});
static constexpr index_t ref_toReduceLen = 8;
static constexpr auto ref_src2dDesc =
make_naive_tensor_descriptor_packed(make_tuple(ref_invariantLen, ref_toReduceLen));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
// used by the DirectThreadWise and DirectWarpWise method
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
};
using refType_src2dDesc = typename get_ref_desc_types::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types::refType_dst1dDesc;
using refType_src2dDesc_padded_12 = typename get_ref_desc_types::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_2(int origReduceLen,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_src_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce =
GridwiseReduction_xy_to_x_direct_warpwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
false,
true,
GredAccessesPerThreadInWarp>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 3 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(ws_buf1_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(ws_buf2_global),
static_cast<int* const __restrict__>(indices_global));
};