"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "33fc3eebdc21821539bd1b17c3957f2184a9792c"
Unverified commit fcbb9788, authored by Chao Liu, committed by GitHub

Dynamic tensor descriptor (#24)



* support dynamic tensor descriptor

* use buffer load OOB feature for padding case

* add navi support

* add int8x4 inference kernel
Co-authored-by: Chao Liu <chao@ixt-rack-81.local.lan>
Co-authored-by: Jing Zhang <jizhan@amd.com>
parent bbcb67d0
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
namespace ck {
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
} // namespace ck
#endif
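// Illustration only, not part of this commit: readfirstlane broadcasts lane 0's
// value to all lanes, which lets the compiler keep a thread-invariant value (and
// arithmetic on it) in scalar registers. CK guards this use behind the
// CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
// config flag further below; the wrapper name here is hypothetical.
__device__ int32_t to_sgpr(int32_t thread_invariant_value)
{
    // If every lane computes the same value, reading lane 0's copy is lossless
    // and pins the result into an SGPR.
    return ck::__llvm_amdgcn_readfirstlane_i32(thread_invariant_value);
}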
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP

#include "functional2.hpp"
#include "sequence.hpp"

namespace ck {

template <typename TData, index_t NSize>
struct Array
{
    using type      = Array;
    using data_type = TData;

    TData mData[NSize];

    __host__ __device__ static constexpr index_t Size() { return NSize; }

    __host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; }

    __host__ __device__ constexpr TData& At(index_t i) { return mData[i]; }

    __host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); }

    __host__ __device__ constexpr TData& operator()(index_t i) { return At(i); }

    template <typename T>
    __host__ __device__ constexpr auto operator=(const T& a)
    {
        static_assert(T::Size() == Size(), "wrong! size not the same");

        static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; });

        return *this;
    }
};

// empty Array
template <typename TData>
struct Array<TData, 0>
{
    using type      = Array;
    using data_type = TData;

    __host__ __device__ static constexpr index_t Size() { return 0; }
};

template <typename X, typename... Xs>
__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs)
{
    using data_type = remove_cv_t<remove_reference_t<X>>;
    return Array<data_type, sizeof...(Xs) + 1>{{std::forward<X>(x), std::forward<Xs>(xs)...}};
}

// make empty array
template <typename X>
__host__ __device__ constexpr auto make_array()
{
    return Array<X, 0>{};
}

} // namespace ck
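// Illustration only, not part of this commit: make_array deduces the element
// type from its first argument; the values below are made up.
__device__ void array_example()
{
    auto a = ck::make_array(1, 2, 3); // ck::Array<int, 3>

    a(0)  = 7;                        // mutable access via operator()
    int x = a[1];                     // read-only access via operator[]
    (void)x;

    auto e = ck::make_array<int>();   // empty ck::Array<int, 0>
    (void)e;
}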
...
@@ -2,21 +2,26 @@
#define CK_COMMON_HEADER_HPP

#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional4.hpp"
#include "in_memory_operation.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"

#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
...
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP

#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
#include "bfloat16_dev.hpp"

// device backend
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX908 0
#define CK_AMD_GPU_GFX1030 0

// HIP version
#ifndef CK_HIP_VERSION_FLAT
#define CK_HIP_VERSION_FLAT 0
#endif

// launch bounds
#define CK_USE_LAUNCH_BOUNDS 0

#ifdef CK_USE_LAUNCH_BOUNDS
#define CK_MAX_THREAD_PER_BLOCK 256
#define CK_MIN_BLOCK_PER_CU 1
#endif

// buffer resource
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif

// multi index
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
@@ -20,14 +47,18 @@
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif

#ifndef CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_FMAC_F32 1
#endif

// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif

// only gfx908 supports native floating-point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif

// AMD XDLOPS
@@ -49,8 +80,16 @@
#endif

// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
@@ -65,14 +104,33 @@
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
#endif

// pass tensor descriptor by value, pointer, or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0

// hack: has an underlying assumption that must be satisfied, otherwise it's a bug
// hack for forcing the compiler to keep idx_diff_low_const in an SGPR; idx_diff_low_const
// must be thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
#endif

// workarounds: put all workarounds here
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif

// workaround for accvgpr over-allocation
#ifndef CK_WORKAROUND_SWDEV_241664
#define CK_WORKAROUND_SWDEV_241664 1
#endif

// workaround for compiler crash when compiling recursive lambda
#ifndef CK_WORKAROUND_SWDEV_275126
#define CK_WORKAROUND_SWDEV_275126 1
#endif

namespace ck {
@@ -91,14 +149,8 @@ enum InMemoryDataOperation
AtomicAdd
};

// index type
using index_t = int32_t;

} // namespace ck
#endif
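// Illustration only, not part of this commit: CK_BUFFER_RESOURCE_3RD_DWORD fills
// the last dword of the 128-bit buffer resource descriptor consumed by AMD's raw
// buffer intrinsics. With the range field set to the tensor's valid size, the
// hardware returns 0 for out-of-range loads and drops out-of-range stores, which
// is what the OOB-check offset tricks above rely on for the padding case. The
// field layout below follows the GCN ISA docs; the names are illustrative.
union BufferResourceSketch
{
    ck::int32x4_t content;   // the 4-dword descriptor passed to buffer intrinsics
    struct
    {
        const void* address; // dwords 0-1: 64-bit base address
        int32_t range;       // dword 2: buffer size in bytes
        int32_t config;      // dword 3: CK_BUFFER_RESOURCE_3RD_DWORD
    } fields;
};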
#ifndef CK_CONTAINER_ELEMENT_PICKER_HPP
#define CK_CONTAINER_ELEMENT_PICKER_HPP
#include "functional2.hpp"
#include "sequence.hpp"
namespace ck {
// Arr: Array or StaticallyIndexedArray
// Picks: Sequence<...>
template <typename Arr, typename Picks>
struct ContainerElementPicker
{
using type = ContainerElementPicker;
#if 0
using data_type = typename Arr::data_type;
#endif
__host__ __device__ constexpr ContainerElementPicker() = delete;
__host__ __device__ constexpr ContainerElementPicker(Arr& array) : mArray{array}
{
constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer<index_t>{}, Number<0>{});
static_assert(imax < Arr::Size(), "wrong! exceeding # array element");
}
__host__ __device__ static constexpr auto Size() { return Picks::Size(); }
template <index_t I>
__host__ __device__ constexpr const auto& At(Number<I> i) const
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray[IP];
}
template <index_t I>
__host__ __device__ constexpr auto& At(Number<I> i)
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray(IP);
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
{
return At(i);
}
template <index_t I>
__host__ __device__ constexpr auto& operator()(Number<I> i)
{
return At(i);
}
template <typename T>
__host__ __device__ constexpr auto operator=(const T& a)
{
static_assert(T::Size() == Size(), "wrong! size not the same");
static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; });
return *this;
}
private:
Arr& mArray;
};
// Arr: Array or StaticallyIndexedArray
// Picks: Sequence<...>
template <typename Arr, typename Picks>
struct ConstantContainerElementPicker
{
using type = ConstantContainerElementPicker;
#if 0
using data_type = typename Arr::data_type;
#endif
__host__ __device__ constexpr ConstantContainerElementPicker() = delete;
__host__ __device__ constexpr ConstantContainerElementPicker(const Arr& array) : mArray{array}
{
constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer<index_t>{}, Number<0>{});
static_assert(imax < Arr::Size(), "wrong! exceeding # array element");
}
__host__ __device__ static constexpr auto Size() { return Picks::Size(); }
template <index_t I>
__host__ __device__ constexpr const auto& At(Number<I> i) const
{
static_assert(I < Size(), "wrong!");
constexpr auto IP = Picks{}[i];
return mArray[IP];
}
template <index_t I>
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
{
return At(i);
}
private:
const Arr& mArray;
};
template <typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator+=(ContainerElementPicker<Arr, Picks>& y, const X& x)
{
using Y = ContainerElementPicker<Arr, Picks>;
constexpr index_t nsize = Y::Size();
static_assert(nsize == X::Size(), "wrong! size not the same");
static_for<0, nsize, 1>{}([&](auto i) { y(i) += x[i]; });
return y;
}
template <typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator-=(ContainerElementPicker<Arr, Picks>& y, const X& x)
{
using Y = ContainerElementPicker<Arr, Picks>;
constexpr index_t nsize = Y::Size();
static_assert(nsize == X::Size(), "wrong! size not the same");
static_for<0, nsize, 1>{}([&](auto i) { y(i) -= x[i]; });
return y;
}
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element(Arr& a, Picks)
{
return ContainerElementPicker<Arr, Picks>(a);
}
template <typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element(const Arr& a, Picks)
{
return ConstantContainerElementPicker<Arr, Picks>(a);
}
} // namespace ck
#endif
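// Illustration only, not part of this commit: a picker is a mutable view over a
// subset of a container's elements, so writes go through to the underlying
// storage. The values below are made up.
__device__ void picker_example()
{
    ck::Array<ck::index_t, 3> idx{0, 0, 0};

    // View over elements 0 and 2 of idx; no copy is made.
    auto sub = ck::pick_container_element(idx, ck::Sequence<0, 2>{});

    sub(ck::Number<0>{}) = 5;                   // idx becomes {5, 0, 0}
    sub += ck::Array<ck::index_t, 2>{1, 2};     // idx becomes {6, 0, 2}
}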
#ifndef CK_CONTAINER_HELPER_HPP
#define CK_CONTAINER_HELPER_HPP
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
namespace ck {
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
{
Array<TData, NSize + 1> r;
static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; });
r(Number<NSize>{}) = x;
return r;
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_front(const Tuple<Ts...>& a, const T& x)
{
return container_cat(make_tuple(x), a);
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_back(const Tuple<Ts...>& a, const T& x)
{
return container_cat(a, make_tuple(x));
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_array(old_array[Number<IRs>{}]...);
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_tuple(old_tuple[Number<IRs>{}]...);
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence<Is...> /* old_seq */,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Sequence<Sequence<Is...>::At(Number<IRs>{})...>{};
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is...> old_seq,
Sequence<IRs...> /* old2new */)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
constexpr auto new2old = typename sequence_map_inverse<Sequence<IRs...>>::type{};
return container_reorder_given_new2old(old_seq, new2old);
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm-4.1 compiler would crash for recursive lambda
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
// f is recursive function, fs is a dummy of f
// i is index, r_old is current reduction
auto f = [&](auto fs, auto i, auto r_old) {
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
// recursively call f/fs
return fs(fs, i + Number<IStep>{}, r_new);
}
else
{
return r_new;
}
};
// start recursion
return f(f, Number<IBegin>{}, init);
}
#else
// i is index, r_old is current reduction
template <typename Container,
typename Reduce,
typename ROld,
index_t I,
index_t IEnd,
index_t IStep>
__host__ __device__ constexpr auto container_reduce_impl(
const Container& x, Reduce reduce, ROld r_old, Number<I> i, Number<IEnd>, Number<IStep>)
{
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
return container_reduce_impl(
x, reduce, r_new, i + Number<IStep>{}, Number<IEnd>{}, Number<IStep>{});
}
else
{
return r_new;
}
}
// rocm-4.1 compiler would crash for recursive lambda
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
return container_reduce_impl(
x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
}
#endif
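// Illustration only, not part of this commit: container_reduce folds a container
// left-to-right with the given binary op; the values below are made up.
__host__ __device__ constexpr index_t container_reduce_example()
{
    constexpr auto a = Array<index_t, 4>{1, 2, 3, 4};

    // ((((0 + 1) + 2) + 3) + 4) = 10
    return container_reduce(a, math::plus<index_t>{}, index_t{0});
}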
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
return y;
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm4.1 compiler would crash with recursive lambda
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
// f is recursive function, fs is a dummy of f
// i is index, y_old is current scan, r_old is current reduction
auto f = [&](auto fs, auto i, auto y_old, auto r_old) {
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return fs(fs, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
};
// start recursion
return f(f, Number<NSize - 1>{}, make_tuple(init), init);
}
#else
// i is index, y_old is current scan, r_old is current reduction
template <typename... Xs, typename Reduce, index_t I, typename YOld, typename ROld>
__host__ __device__ constexpr auto container_reverse_exclusive_scan_impl(
const Tuple<Xs...>& x, Reduce reduce, Number<I> i, YOld y_old, ROld r_old)
{
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return container_reverse_exclusive_scan_impl(x, reduce, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
}
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
return container_reverse_exclusive_scan_impl(
x, reduce, Number<NSize - 1>{}, make_tuple(init), init);
}
#endif
// TODO: update to be like container_reverse_exclusive_scan, to deal with Tuple of Number<>
template <typename... Xs, typename Reduce, typename TData>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto container_cat(const X& x, const Ys&... ys)
{
return container_cat(x, container_cat(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_cat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
}
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_cat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(
[&](auto&&... zs) { return make_tuple(std::forward<decltype(zs)>(zs)...); }, tx, ty);
}
template <typename Container>
__host__ __device__ constexpr auto container_cat(const Container& x)
{
return x;
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Array<T, N>& arr, Sequence<Is...>)
{
static_assert(N >= sizeof...(Is), "wrong! size");
return make_array(arr[Number<Is>{}]...);
}
template <typename... Ts, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Tuple<Ts...>& tup, Sequence<Is...>)
{
static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! size");
return make_tuple(tup[Number<Is>{}]...);
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void
set_container_subset(Array<T, N>& y, Sequence<Is...> picks, const Array<T, sizeof...(Is)>& x)
{
static_assert(N >= sizeof...(Is), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void
set_container_subset(Tuple<Ys...>& y, Sequence<Is...> picks, const Tuple<Xs...>& x)
{
static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <index_t... Is>
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence<Is...>)
{
using Seq = Sequence<Is...>;
return generate_tuple(
[&](auto i) {
constexpr index_t tmp = Seq::At(i);
return Number<tmp>{};
},
Seq::Size());
}
} // namespace ck
#endif
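// Illustration only, not part of this commit: a reverse exclusive scan with
// multiplication turns packed tensor lengths into strides -- the core index
// arithmetic behind the dynamic tensor descriptor. Example values are made up:
// lengths {2, 3, 4} -> strides {12, 4, 1}.
__host__ __device__ constexpr auto packed_strides_example()
{
    constexpr auto lengths = ck::Array<ck::index_t, 3>{2, 3, 4};

    return ck::container_reverse_exclusive_scan(
        lengths, ck::math::multiplies<ck::index_t>{}, ck::index_t{1});
}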
...
@@ -3,264 +3,279 @@
namespace ck {

template <typename T, index_t N>
struct vector_type;

template <typename T>
struct vector_type<T, 1>
{
    using type = T;

    union
    {
        T d1_;
        StaticallyIndexedArray<T, 1> d1x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 1; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d1_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d1_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x1_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x1_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x1_; }
    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x1_; }
};

template <typename T>
struct vector_type<T, 2>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));

    using type = d2_t;

    union
    {
        d2_t d2_;
        StaticallyIndexedArray<d1_t, 2> d1x2_;
        StaticallyIndexedArray<d2_t, 1> d2x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 2; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x2_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x1_; }
};

template <typename T>
struct vector_type<T, 4>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));
    typedef T d4_t __attribute__((ext_vector_type(4)));

    using type = d4_t;

    union
    {
        d4_t d4_;
        StaticallyIndexedArray<d1_t, 4> d1x4_;
        StaticallyIndexedArray<d2_t, 2> d2x2_;
        StaticallyIndexedArray<d4_t, 1> d4x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x4_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x1_; }
};

template <typename T>
struct vector_type<T, 8>
{
    using d1_t = T;
    typedef T d2_t __attribute__((ext_vector_type(2)));
    typedef T d4_t __attribute__((ext_vector_type(4)));
    typedef T d8_t __attribute__((ext_vector_type(8)));

    using type = d8_t;

    union
    {
        d8_t d8_;
        StaticallyIndexedArray<d1_t, 8> d1x8_;
        StaticallyIndexedArray<d2_t, 4> d2x4_;
        StaticallyIndexedArray<d4_t, 2> d4x2_;
        StaticallyIndexedArray<d8_t, 1> d8x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 8; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d8_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d8_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x8_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x8_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x8_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<8>) const { return data_.d8x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x8_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<8>) { return data_.d8x1_; }
};

template <>
struct vector_type<int8_t, 2>
{
    using d1_t = int8_t;
    typedef int16_t d2_t;

    using type = d2_t;

    union
    {
        d2_t d2_;
        StaticallyIndexedArray<d1_t, 2> d1x2_;
        StaticallyIndexedArray<d2_t, 1> d2x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 2; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d2_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d2_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x2_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x1_; }
};

template <>
struct vector_type<int8_t, 4>
{
    using d1_t = int8_t;
    typedef int16_t d2_t;
    typedef int32_t d4_t;

    using type = d4_t;

    union
    {
        d4_t d4_;
        StaticallyIndexedArray<d1_t, 4> d1x4_;
        StaticallyIndexedArray<d2_t, 2> d2x2_;
        StaticallyIndexedArray<d4_t, 1> d4x1_;
    } data_;

    __host__ __device__ constexpr vector_type() : data_{type{0}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    __host__ __device__ static constexpr index_t Size() { return 4; }

    __host__ __device__ constexpr const auto& Vector() const { return data_.d4_; }
    __host__ __device__ constexpr auto& Vector() { return data_.d4_; }

    __host__ __device__ constexpr const auto& Scalars() const { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Scalars() { return data_.d1x4_; }

    __host__ __device__ constexpr const auto& Vectors(Number<1>) const { return data_.d1x4_; }
    __host__ __device__ constexpr const auto& Vectors(Number<2>) const { return data_.d2x2_; }
    __host__ __device__ constexpr const auto& Vectors(Number<4>) const { return data_.d4x1_; }

    __host__ __device__ constexpr auto& Vectors(Number<1>) { return data_.d1x4_; }
    __host__ __device__ constexpr auto& Vectors(Number<2>) { return data_.d2x2_; }
    __host__ __device__ constexpr auto& Vectors(Number<4>) { return data_.d4x1_; }
};

// fp32
using float2_t = typename vector_type<float, 2>::type;
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;

// fp16
using half_t  = _Float16;
using half2_t = typename vector_type<half_t, 2>::type;
using half4_t = typename vector_type<half_t, 4>::type;
using half8_t = typename vector_type<half_t, 8>::type;

// bfp16
using ushort2_t = typename vector_type<ushort, 2>::type;
using ushort4_t = typename vector_type<ushort, 4>::type;
using ushort8_t = typename vector_type<ushort, 8>::type;

// i32
using int32x2_t = typename vector_type<int32_t, 2>::type;
using int32x4_t = typename vector_type<int32_t, 4>::type;
using int32x8_t = typename vector_type<int32_t, 8>::type;

// i8
// hack for int8x4_t, because compiler does not have native support for int8x4_t;
// int8x4_t is defined as int32_t
using int8x4_t = typename vector_type<int8_t, 4>::type;

// data type conversion
template <typename T>
struct type_convert
@@ -291,113 +306,37 @@ struct inner_product_with_conversion
{
    static constexpr auto convert = type_convert<T>();

    template <typename X, index_t N>
    __device__ T operator()(typename vector_type<X, N>::type a,
                            typename vector_type<X, N>::type b) const
    {
        const vector_type<X, N> a_vector{a};
        const vector_type<X, N> b_vector{b};

        T acc = 0;

        static_for<0, N, 1>{}([&](auto i) {
            acc += convert(a_vector.Scalars()[i]) * convert(b_vector.Scalars()[i]);
        });

        return acc;
    }

    __device__ T operator()(float_t a, float_t b) const { return convert(a) * convert(b); }

    // hack for int8x4_t, because compiler does not have native support for int8x4_t;
    // int8x4_t is defined as int32_t
    __device__ T operator()(int8x4_t a, int8x4_t b) const
    {
        const vector_type<int8_t, 4> a_vector{a};
        const vector_type<int8_t, 4> b_vector{b};

        T acc = 0;

        static_for<0, 4, 1>{}([&](auto i) {
            acc += convert(a_vector.Scalars()[i]) * convert(b_vector.Scalars()[i]);
        });

        return acc;
    }
};
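// Illustration only, not part of this commit: the union lets the same registers
// be read either as one hardware vector or lane-by-lane, which is what the
// rewritten inner product above exploits.
__device__ float sum_half4(half4_t v)
{
    const vector_type<half_t, 4> x{v};

    float acc = 0;

    static_for<0, 4, 1>{}([&](auto i) { acc += static_cast<float>(x.Scalars()[i]); });

    return acc;
}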
...
@@ -32,16 +32,16 @@ struct vector_type
    typedef struct
    {
        T scalar[N];
    } type;
};

template <>
struct vector_type<float, 1>
{
    using type = float;

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
@@ -51,22 +51,22 @@ struct vector_type<float, 1>

template <>
struct vector_type<float, 2>
{
    using type = float2_t;

    union DataType
    {
        type vector;
        float scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
    }

    __host__ __device__ static type Pack(float s0, float s1)
    {
        DataType data;
        data.scalar[0] = s0;
@@ -78,12 +78,12 @@ struct vector_type<float, 2>

template <>
struct vector_type<float, 4>
{
    using type = float4_t;

    __host__ __device__ static constexpr index_t GetSize() { return 4; }

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, float s, Number<I>)
    {
        static_assert(I < 4, "wrong");
        *(reinterpret_cast<float*>(&v) + I) = s;
@@ -93,10 +93,10 @@ struct vector_type<float, 4>

template <>
struct vector_type<half_t, 1>
{
    using type = half_t;

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 1, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;
@@ -106,22 +106,22 @@ struct vector_type<half_t, 1>

template <>
struct vector_type<half_t, 2>
{
    using type = half2_t;

    union DataType
    {
        type vector;
        half_t scalar[2];
    };

    template <index_t I>
    __host__ __device__ static void SetScalar(type& v, half_t s, Number<I>)
    {
        static_assert(I < 2, "wrong");
        *(reinterpret_cast<half_t*>(&v) + I) = s;
    }

    __host__ __device__ static type Pack(half_t s0, half_t s1)
    {
        DataType data;
        data.scalar[0] = s0;
...
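// Illustration only, not part of this commit: usage of the renamed API in this
// second header. Pack builds a vector from scalars; SetScalar overwrites a
// single lane in place.
__device__ float2_t pack_and_patch(float a, float b)
{
    auto v = vector_type<float, 2>::Pack(a, b);             // v = {a, b}
    vector_type<float, 2>::SetScalar(v, 3.0f, Number<0>{}); // v = {3.0f, b}
    return v;
}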
...
@@ -2,7 +2,6 @@
#define CK_FUNCTIONAL_HPP

#include "integral_constant.hpp"
#include "type.hpp"

namespace ck {
@@ -56,8 +55,10 @@ struct static_if<true>
    __host__ __device__ constexpr auto operator()(F f) const
    {
        // This is a trick for the compiler: pass forwarder to lambda "f" as an
        // "auto" argument and make sure "f" uses it. That makes "f" a generic
        // lambda, so "f" is not compiled until it is instantiated here.
        f(forwarder{});
        return Type{};
@@ -84,8 +85,10 @@ struct static_if<false>
    __host__ __device__ static void Else(F f)
    {
        // Same trick as above: passing forwarder as an "auto" argument makes "f"
        // a generic lambda, so "f" is not compiled until it is instantiated here.
        f(forwarder{});
    }
...
@@ -32,7 +32,8 @@ struct static_for
    static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
                  "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
    static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
                  "wrong! should satisfy (Increment > 0 && NBegin <= NEnd) || "
                  "(Increment < 0 && NBegin >= NEnd)");
}

template <class F>
...
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "sequence.hpp" #include "sequence.hpp"
#include "array.hpp" #include "multi_index.hpp"
namespace ck { namespace ck {
...@@ -63,7 +63,7 @@ struct ford_impl ...@@ -63,7 +63,7 @@ struct ford_impl
for(index_t i = 0; i < RemainLengths::Front(); ++i) for(index_t i = 0; i < RemainLengths::Front(); ++i)
{ {
ford_impl<decltype(RemainLengths::PopFront()), Orders>{}( ford_impl<decltype(RemainLengths::PopFront()), Orders>{}(
f, current_ordered_id.PushBack(i)); f, container_push_back(current_ordered_id, i));
} }
} }
}; };
...@@ -77,14 +77,16 @@ struct ford_impl<Sequence<>, Orders> ...@@ -77,14 +77,16 @@ struct ford_impl<Sequence<>, Orders>
__host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const
{ {
// retrive unordered Id // retrive unordered Id
f(reorder_array_given_old2new(current_ordered_id, Orders{})); f(container_reorder_given_old2new(current_ordered_id, Orders{}));
} }
}; };
} // namespace detail } // namespace detail
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop // Lengths is Sequence<...>, it is the length of each dimension for
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each // N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford
// will loop over each
// dimension // dimension
template <class Lengths, template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type> class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
...@@ -106,8 +108,10 @@ struct static_ford ...@@ -106,8 +108,10 @@ struct static_ford
     }
 };

-// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
-// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
+// Lengths is Sequence<...>, it is the length of each dimension for
+// N-dimensional loop
+// Orders is Sequence<...>, it is the order of dimension in which ford will loop
+// over each
 // dimension
 template <class Lengths,
           class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
@@ -129,7 +133,7 @@ struct ford
         for(index_t i = 0; i < ordered_lengths.Front(); ++i)
         {
-            detail::ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
+            detail::ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, make_multi_index(i));
         }
     }
 };
...
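The change above routes the running index through `container_push_back` and `make_multi_index` instead of `Array::PushBack`; the observable behaviour of `ford` is unchanged. A usage sketch (hypothetical, assuming the multi-index behaves like an `Array<index_t, N>` indexable with `operator[]`):

__host__ __device__ inline void visit_2x3()
{
    // Orders = Sequence<1, 0>: dimension 1 becomes the outer loop and
    // dimension 0 the inner loop; f still receives the index in the
    // original dimension order
    ford<Sequence<2, 3>, Sequence<1, 0>>{}([&](auto idx) {
        const index_t i0 = idx[0]; // runs over 0..1
        const index_t i1 = idx[1]; // runs over 0..2
        (void)i0;
        (void)i1;
    });
}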
@@ -16,18 +16,46 @@ template <index_t... Is>
 struct unpack_impl<Sequence<Is...>>
 {
     template <typename F, typename X>
-    __host__ __device__ constexpr auto operator()(F f, const X& x) const
+    __host__ __device__ constexpr auto operator()(F&& f, X&& x) const
     {
-        return f(x.At(Number<Is>{})...);
+        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...);
     }
 };

+template <typename Seq0, typename Seq1>
+struct unpack2_impl;
+
+// TODO: remove this, after properly implementing unpack that takes any number of containers
+template <index_t... Is, index_t... Js>
+struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
+{
+    template <typename F, typename X, typename Y>
+    __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
+    {
+        return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...,
+                                  std::forward<Y>(y).At(Number<Js>{})...);
+    }
+};
+
 } // namespace detail
 template <typename F, typename X>
-__host__ __device__ constexpr auto unpack(F f, const X& x)
+__host__ __device__ constexpr auto unpack(F&& f, X&& x)
 {
-    return detail::unpack_impl<typename arithmetic_sequence_gen<0, X::Size(), 1>::type>{}(f, x);
+    using X_ = remove_reference_t<X>;
+
+    return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
+        std::forward<F>(f), std::forward<X>(x));
+}
+
+// TODO: properly implement unpack that takes any number of containers
+template <typename F, typename X, typename Y>
+__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
+{
+    using X_ = remove_reference_t<X>;
+    using Y_ = remove_reference_t<Y>;
+
+    return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
+                                typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
+        std::forward<F>(f), std::forward<X>(x), std::forward<Y>(y));
 }

 } // namespace ck
...
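With the perfect-forwarding rewrite, `unpack` now accepts rvalue containers as well as lvalues. A usage sketch (hypothetical helper, not in this commit): it expands a container into a plain argument list.

__host__ __device__ inline index_t volume_3d(const Array<index_t, 3>& lengths)
{
    // expands to f(lengths.At(Number<0>{}), lengths.At(Number<1>{}), lengths.At(Number<2>{}))
    return unpack([](index_t a, index_t b, index_t c) { return a * b * c; }, lengths);
}

`unpack2` does the same over the concatenation of two containers, as a stopgap until a truly variadic `unpack` exists (per the TODO above).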
@@ -5,6 +5,7 @@
 #if CK_USE_AMD_BUFFER_ADDRESSING
 #include "amd_buffer_addressing.hpp"
+#include "amd_buffer_addressing_v2.hpp"
 #endif

 namespace ck {
@@ -43,7 +44,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
 template <typename T, index_t DataPerAccess>
 struct SetData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
@@ -60,8 +61,13 @@ struct SetData
     {
         if(src_valid)
         {
+#if 0
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                 *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+#else
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                *reinterpret_cast<const vector_t*>(&p_src[0x3fffffff & src_offset]);
+#endif
         }
         else
         {
@@ -88,7 +94,7 @@ struct SetData
         if(dst_valid)
         {
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
+                amd_buffer_load_v2<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
         }
     }
@@ -108,12 +114,12 @@ struct SetData
     {
         const auto zeros = vector_t(0);

-        amd_buffer_store<T, DataPerAccess>(src_valid ? &(p_src[src_offset])
-                                                     : reinterpret_cast<const T*>(&zeros),
-                                           p_dst,
-                                           dst_offset,
-                                           dst_valid,
-                                           dst_range);
+        amd_buffer_store_v2<T, DataPerAccess>(
+            src_valid ? *reinterpret_cast<const vector_t*>(&(p_src[src_offset])) : zeros,
+            p_dst,
+            dst_offset,
+            dst_valid,
+            dst_range);
     }
 #endif
 };
@@ -121,7 +127,7 @@ struct SetData
 template <typename T, index_t DataPerAccess>
 struct AtomicAddData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     // This version is only for compatibility, don't use this version if possible
     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
@@ -141,7 +147,7 @@ struct AtomicAddData
         }
     }

-#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
+#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_FADD
     // buffer_atomic requires:
     // 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
     // 2) p_dst_thread to be a wavewise pointer.
@@ -185,25 +191,26 @@ __device__ void transfer_data(const T* p_src,
"wrong! InMemoryDataOperation not supported!"); "wrong! InMemoryDataOperation not supported!");
// keep it simple, don't use static_if here, otherwise compiler will do weird things // keep it simple, don't use static_if here, otherwise compiler will do weird things
if(SrcDataStride == 1 && DstDataStride == 1) if constexpr(SrcDataStride == 1 && DstDataStride == 1)
{ {
// TODO: use static_if::ElseIf if constexpr(DstInMemOp == InMemoryDataOperation::Set)
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { {
SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); }
else if constexpr(DstInMemOp == InMemoryDataOperation::AtomicAdd)
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { {
AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range); p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
}); }
} }
else else
{ {
#pragma unroll
for(index_t i = 0; i < DataPerAccess; ++i) for(index_t i = 0; i < DataPerAccess; ++i)
{ {
// TODO: use static_if::ElseIf if constexpr(DstInMemOp == InMemoryDataOperation::Set)
static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) { {
SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>( SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
...@@ -213,9 +220,9 @@ __device__ void transfer_data(const T* p_src, ...@@ -213,9 +220,9 @@ __device__ void transfer_data(const T* p_src,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid, dst_valid,
dst_range); dst_range);
}); }
else if constexpr(DstInMemOp == InMemoryDataOperation::AtomicAdd)
static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) { {
AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>( AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
p_src, p_src,
src_offset + i * SrcDataStride, src_offset + i * SrcDataStride,
...@@ -225,7 +232,7 @@ __device__ void transfer_data(const T* p_src, ...@@ -225,7 +232,7 @@ __device__ void transfer_data(const T* p_src,
dst_offset + i * DstDataStride, dst_offset + i * DstDataStride,
dst_valid, dst_valid,
dst_range); dst_range);
}); }
} }
} }
} }
...
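The rewrite above replaces the `static_if` lambdas with C++17 `if constexpr`, which gives the same discarded-branch semantics without the generic-lambda machinery. A standalone sketch of the pattern (hypothetical, simplified from `transfer_data`):

enum class Op
{
    Set,
    Add
};

template <Op op>
__device__ void store_element(float* p_dst, float v)
{
    // only the taken branch is instantiated, so e.g. atomicAdd does not
    // have to compile for Op::Set
    if constexpr(op == Op::Set)
    {
        *p_dst = v;
    }
    else if constexpr(op == Op::Add)
    {
        atomicAdd(p_dst, v);
    }
}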
@@ -37,7 +37,7 @@ __device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
 template <typename T, index_t DataPerAccess>
 struct SetData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
     __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
@@ -50,7 +50,7 @@ struct SetData
 template <typename T, index_t DataPerAccess>
 struct AtomicAddData
 {
-    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;
+    using vector_t = typename vector_type<T, DataPerAccess>::type;

     template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
     __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
...
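Both structs now use `vector_type<T, N>::type` (renamed from `MemoryType`). That alias is what lets a single assignment move N elements at once. A sketch (hypothetical, assuming the alias names a native vector type such as `float4_t` for `vector_type<float, 4>`):

template <typename T, index_t N>
__device__ void copy_vectorized(const T* p_src, T* p_dst)
{
    using vector_t = typename vector_type<T, N>::type;

    // one vector_t-wide load and store instead of N scalar ones
    *reinterpret_cast<vector_t*>(p_dst) = *reinterpret_cast<const vector_t*>(p_src);
}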
@@ -33,6 +33,15 @@ struct multiplies
     __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
 };

+struct multiplies_v2
+{
+    template <typename A, typename B>
+    __host__ __device__ constexpr auto operator()(const A& a, const B& b) const
+    {
+        return a * b;
+    }
+};
+
 template <class T>
 struct maxer
 {
@@ -105,8 +114,7 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 }

 // greatest common divisor, aka highest common factor
-template <typename T>
-__host__ __device__ constexpr T gcd(T x, T y)
+__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
     if(x == y || x == 0)
     {
@@ -129,24 +137,29 @@ __host__ __device__ constexpr T gcd(T x, T y)
 template <index_t X, index_t Y>
 __host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
 {
-    constexpr auto result = gcd(X, Y);
-    return Number<result>{};
+    constexpr auto r = gcd(X, Y);
+
+    return Number<r>{};
 }
-template <typename X, typename... Ys>
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
 __host__ __device__ constexpr auto gcd(X x, Ys... ys)
 {
-    return gcd(x, ys...);
+    return gcd(x, gcd(ys...));
 }

 // least common multiple
-template <typename T>
-__host__ __device__ constexpr T lcm(T x, T y)
+template <typename X, typename Y>
+__host__ __device__ constexpr auto lcm(X x, Y y)
 {
     return (x * y) / gcd(x, y);
 }

-template <typename X, typename... Ys>
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
 __host__ __device__ constexpr auto lcm(X x, Ys... ys)
 {
     return lcm(x, lcm(ys...));
@@ -165,6 +178,6 @@ struct less
 };

 } // namespace math
-} // namspace ck
+} // namespace ck

 #endif
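The `Number<>` overload keeps a gcd result usable in further compile-time arithmetic. A usage sketch (hypothetical, assuming a translation unit that includes this header):

// scalar form, evaluable at compile time
static_assert(ck::math::gcd(12, 18) == 6, "");
static_assert(ck::math::lcm(12, 18) == 36, "");

// Number<> form: the result is itself a Number<>
constexpr auto g = ck::math::gcd(ck::Number<12>{}, ck::Number<18>{});
static_assert(decltype(g)::value == 6, "");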
#ifndef CK_PRINT_HPP
#define CK_PRINT_HPP
#include "array.hpp"
#include "statically_indexed_array.hpp"
#include "container_helper.hpp"
#include "sequence.hpp"
namespace ck {
template <typename T>
__host__ __device__ void print_array(const char* s, T a)
{
using data_type = decltype(a.At(Number<0>{}));
constexpr index_t nsize = a.Size();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
{
printf("%s size %u, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%u, ", uint32_t{a[i]}); });
printf("}\n");
}
else if constexpr(is_same<data_type, int32_t>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
printf("}\n");
}
else if constexpr(is_same<data_type, bool>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", bool{a[i]}); });
printf("}\n");
}
#else
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); });
printf("}\n");
#endif
}
template <typename T>
__host__ __device__ void print_array_v2(const char* s, T a)
{
using data_type = decltype(a.At(Number<0>{}));
constexpr index_t nsize = a.Size();
#if 0
if constexpr(is_same<data_type, uint32_t>{})
{
printf("%s size %u, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%u] %u, ", i.value, a[i]); });
printf("}\n");
}
else if constexpr(is_same<data_type, int32_t>{})
{
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
printf("}\n");
}
#else
printf("%s size %d, {", s, nsize);
static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("[%d] %d, ", i.value, a[i]); });
printf("}\n");
#endif
}
} // namespace ck
#endif
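A usage sketch for the printers above (hypothetical values):

__device__ void debug_print_lengths()
{
    const auto lengths = Array<index_t, 3>{8, 16, 32};

    // prints: lengths size 3, {8, 16, 32, }
    print_array("lengths", lengths);

    // prints: lengths size 3, {[0] 8, [1] 16, [2] 32, }
    print_array_v2("lengths", lengths);
}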
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
namespace ck {
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
{
constexpr index_t nsize = a.GetSize();
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
} // namespace ck
#endif
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
namespace ck {
template <index_t... Xs>
__host__ __device__ void print_sequence(const char* s, Sequence<Xs...>)
{
constexpr index_t nsize = Sequence<Xs...>::Size();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
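A usage sketch (hypothetical values):

__device__ void debug_print_strides()
{
    // prints: strides size 3, {64 8 1}
    print_sequence("strides", Sequence<64, 8, 1>{});
}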
@@ -168,6 +168,14 @@ struct Sequence
     {
         return Sequence<f(Is)...>{};
     }
+
+    __host__ __device__ static void Print()
+    {
+        printf("{");
+        printf("size %d, ", index_t{Size()});
+        static_for<0, Size(), 1>{}([&](auto i) { printf("%d ", At(i).value); });
+        printf("}");
+    }
 };
 // merge sequence
@@ -750,6 +758,13 @@ __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce,
     return typename sequence_reverse_inclusive_scan<Seq, Reduce, Init>::type{};
 }

+template <typename Seq, typename Reduce, index_t Init>
+__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number<Init>)
+{
+    return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number<Init>{})
+        .PushBack(Number<Init>{});
+}
+
 template <typename Seq, typename Reduce, index_t Init>
 __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
 {
...
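The new `reverse_exclusive_scan_sequence` is the natural building block for deriving packed strides from lengths, which is exactly what a tensor descriptor needs. A sketch (hypothetical, assuming it lives inside namespace ck):

__host__ __device__ constexpr auto packed_strides_2x3x4()
{
    // lengths {2, 3, 4} -> strides {3*4, 4, 1} = {12, 4, 1}
    return reverse_exclusive_scan_sequence(
        Sequence<2, 3, 4>{}, math::multiplies<index_t>{}, Number<1>{});
}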
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#include "sequence_helper.hpp"
namespace ck {
template <typename F, index_t N>
__host__ __device__ constexpr auto generate_sequence(F, Number<N>)
{
return typename sequence_gen<N, F>::type{};
}
} // namespace ck
#endif
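A usage sketch (hypothetical, assuming the functor protocol of `sequence_gen`: a default-constructible class whose call operator maps `Number<I>` to an `index_t` at compile time):

struct times_two
{
    template <index_t I>
    __host__ __device__ constexpr index_t operator()(Number<I>) const
    {
        return 2 * I;
    }
};

// would yield Sequence<0, 2, 4, 6>
using doubled = decltype(generate_sequence(times_two{}, Number<4>{}));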