Unverified Commit 0e92deb7 authored by Chao Liu, committed by GitHub

Tile program init bulk PR (#4)

---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
Co-authored-by: Po-Yen, Chen <PoYen.Chen@amd.com>
parent 0077eeb3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/buffer_view_declare.hpp"
#include "ck/utility/buffer_view_impl_generic.hpp"
#include "ck/utility/buffer_view_impl_global.hpp"
#include "ck/utility/buffer_view_impl_lds.hpp"
#include "ck/utility/buffer_view_impl_vgpr.hpp"
namespace ck {
template <AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence,
typename T,
typename BufferSizeType>
__host__ __device__ constexpr auto make_buffer_view(T* p, BufferSizeType buffer_size)
{
return BufferView<BufferAddressSpace, T, BufferSizeType, true, Coherence>{p, buffer_size};
}
template <
AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence,
typename T,
typename BufferSizeType,
typename X,
typename enable_if<is_same<remove_cvref_t<T>, remove_cvref_t<X>>::value, bool>::type = false>
__host__ __device__ constexpr auto
make_buffer_view(T* p, BufferSizeType buffer_size, X invalid_element_value)
{
return BufferView<BufferAddressSpace, T, BufferSizeType, false, Coherence>{
p, buffer_size, invalid_element_value};
}
} // namespace ck
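// A minimal usage sketch of the two overloads above; the Global address
// space, the float element type, and the element count are illustrative
// assumptions, not requirements of this header.
__device__ void make_buffer_view_example(float* p_global)
{
    // invalid accesses read back as numerical zero
    auto buf_zero = ck::make_buffer_view<ck::AddressSpaceEnum::Global>(p_global, 1024);

    // invalid accesses read back as a customized value; the fill value's type
    // must match the buffer's element type T (here both float)
    auto buf_fill = ck::make_buffer_view<ck::AddressSpaceEnum::Global>(p_global, 1024, -1.0f);
    (void)buf_zero;
    (void)buf_fill;
}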
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/enable_if.hpp"
#include "ck/utility/c_style_pointer_cast.hpp"
namespace ck {
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
// transforms of TensorView/Tensor
// FIXME: AmdBufferCoherenceEnum is only meaningful for buffer addressing. Need to split BufferView
// definition for different memory address space (Global/GenericLds/Vgpr)
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename BufferSizeType,
bool InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence>
struct BufferView;
} // namespace ck
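// Illustration of the T/X contract documented above, assuming CK's float4_t
// alias (a 4-wide float vector); scalar_type is the same trait Get/Set use.
static_assert(ck::scalar_type<float>::vector_size == 1, "T = float is a scalar");
static_assert(ck::scalar_type<ck::float4_t>::vector_size == 4, "X = float4_t packs 4 scalars");
static_assert(ck::is_same<ck::scalar_type<ck::float4_t>::type, float>::value,
              "T and X share the scalar type float, and one X spans four T");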
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/buffer_view_declare.hpp"
#include "ck/utility/generic_memory_space_atomic.hpp"
namespace ck {
// Address Space: Generic
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
// transforms of TensorView/Tensor
template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
struct BufferView<AddressSpaceEnum::Generic,
T,
BufferSizeType,
InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum::DefaultCoherence>
{
using type = T;
T* p_data_ = nullptr;
BufferSizeType buffer_size_;
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr BufferView() : p_data_{}, buffer_size_{}, invalid_element_value_{}
{
}
__host__ __device__ constexpr BufferView(T* p_data, BufferSizeType buffer_size)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
{
}
__host__ __device__ constexpr BufferView(T* p_data,
BufferSizeType buffer_size,
T invalid_element_value)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
{
}
__device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return AddressSpaceEnum::Generic;
}
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
// i is offset of T, not X. i should be aligned to X
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
// FIXME: remove InMemoryDataOperationEnum::Add
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
}
// FIXME: remove
__device__ static constexpr bool IsStaticBuffer() { return false; }
// FIXME: remove
__device__ static constexpr bool IsDynamicBuffer() { return true; }
__host__ __device__ void Print() const
{
printf("BufferView{");
// AddressSpace
printf("AddressSpace: Generic, ");
// p_data_
printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
// buffer_size_
printf("buffer_size_: ");
print(buffer_size_);
printf(", ");
// invalid_element_value_
printf("invalid_element_value_: ");
print(invalid_element_value_);
printf("}");
}
};
} // namespace ck
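// Hedged round-trip sketch for the Generic specialization above; float4_t and
// the offsets are illustrative. Note that i is an offset in units of T, so
// with X = float4_t it must be a multiple of 4.
__device__ void generic_buffer_view_example(float* p, ck::index_t n)
{
    auto view = ck::make_buffer_view<ck::AddressSpaceEnum::Generic>(p, n);

    // read four floats at T-offset 0, store them back at T-offset 4
    auto x = view.Get<ck::float4_t>(0, /*is_valid_element=*/true);
    view.Set<ck::float4_t>(4, true, x);

    // Update<Add> is a plain read-modify-write (Get then Set), not an atomic
    view.Update<ck::InMemoryDataOperationEnum::Add, ck::float4_t>(4, true, x);
}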
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/buffer_view.hpp"
#include "ck/utility/amd_buffer_addressing.hpp"
namespace ck {
// Address Space: Global
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
// transforms of TensorView/Tensor
template <typename T,
typename BufferSizeType,
bool InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum Coherence>
struct BufferView<AddressSpaceEnum::Global,
T,
BufferSizeType,
InvalidElementUseNumericalZeroValue,
Coherence>
{
using type = T;
T* p_data_ = nullptr;
BufferSizeType buffer_size_;
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr BufferView() : p_data_{}, buffer_size_{}, invalid_element_value_{}
{
}
__host__ __device__ constexpr BufferView(T* p_data, BufferSizeType buffer_size)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
{
}
__host__ __device__ constexpr BufferView(T* p_data,
BufferSizeType buffer_size,
T invalid_element_value)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
{
}
__device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return AddressSpaceEnum::Global;
}
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_LOAD
bool constexpr use_amd_buffer_addressing = true;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(InvalidElementUseNumericalZeroValue)
{
return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
t_per_x,
Coherence>(
p_data_, i, is_valid_element, buffer_size_);
}
else
{
return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
t_per_x,
Coherence>(
p_data_, i, is_valid_element, buffer_size_, invalid_element_value_);
}
}
else
{
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
}
// i is offset of T, not X. i should be aligned to X
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
{
this->template AtomicAdd<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
{
this->template AtomicMax<X>(i, is_valid_element, x);
}
// FIXME: remove InMemoryDataOperationEnum::Add
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
// tmp += x;
// this->template Set<X>(i, is_valid_element, tmp);
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_STORE
bool constexpr use_amd_buffer_addressing = true;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
x, p_data_, i, is_valid_element, buffer_size_);
}
else
{
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
{
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x, Coherence>(
x, p_data_, i, is_valid_element, buffer_size_);
}
else
{
if(is_valid_element)
{
atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, buffer_size_);
}
else if(is_valid_element)
{
atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
// FIXME: remove
__device__ static constexpr bool IsStaticBuffer() { return false; }
// FIXME: remove
__device__ static constexpr bool IsDynamicBuffer() { return true; }
__host__ __device__ void Print() const
{
printf("BufferView{");
// AddressSpace
printf("AddressSpace: Global, ");
// p_data_
printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
// buffer_size_
printf("buffer_size_: ");
print(buffer_size_);
printf(", ");
// invalid_element_value_
printf("invalid_element_value_: ");
print(invalid_element_value_);
printf("}");
}
};
} // namespace ck
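// Sketch of the atomic update path above, with illustrative names: a float
// partial sum accumulated into global memory through Update's dispatch.
__device__ void global_atomic_add_example(float* p_out, ck::index_t n, ck::index_t i, float partial)
{
    auto out = ck::make_buffer_view<ck::AddressSpaceEnum::Global>(p_out, n);

    // takes the amd_buffer_atomic_add path when the buffer-atomic macros allow
    // this scalar type; otherwise falls back to the generic atomic_add above
    out.Update<ck::InMemoryDataOperationEnum::AtomicAdd, float>(i, i < n, partial);
}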
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/buffer_view.hpp"
namespace ck {
// Address Space: LDS
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
// transforms of TensorView/Tensor
template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
struct BufferView<AddressSpaceEnum::Lds,
T,
BufferSizeType,
InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum::DefaultCoherence>
{
using type = T;
T* p_data_ = nullptr;
BufferSizeType buffer_size_;
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr BufferView() : p_data_{}, buffer_size_{}, invalid_element_value_{}
{
}
__host__ __device__ constexpr BufferView(T* p_data, BufferSizeType buffer_size)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
{
}
__host__ __device__ constexpr BufferView(T* p_data,
BufferSizeType buffer_size,
T invalid_element_value)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
{
}
__device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpaceEnum::Lds; }
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
// i is offset of T, not X. i should be aligned to X
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
// FIXME: remove InMemoryDataOperationEnum::Add
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
bool constexpr workaround_int8_ds_write_issue = true;
#else
bool constexpr workaround_int8_ds_write_issue = false;
#endif
if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
workaround_int8_ds_write_issue)
{
if(is_valid_element)
{
// HACK: the compiler would lower the IR "store<i8, 16> address_space(3)" into
// inefficient ISA, so try to let the compiler emit the IR "store<i32, 4>",
// which would be lowered to ds_write_b128
// TODO: remove this after compiler fix
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
}
}
else
{
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
}
}
// FIXME: remove
__device__ static constexpr bool IsStaticBuffer() { return false; }
// FIXME: remove
__device__ static constexpr bool IsDynamicBuffer() { return true; }
__host__ __device__ void Print() const
{
printf("BufferView{");
// AddressSpace
printf("AddressSpace: Lds, ");
// p_data_
printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
// buffer_size_
printf("buffer_size_: ");
print(buffer_size_);
printf(", ");
// invalid_element_value_
printf("invalid_element_value_: ");
print(invalid_element_value_);
printf("}");
}
};
} // namespace ck
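// Why the reinterpretation above is sound: every branch pairs stores of equal
// byte width, so only the IR element type changes, never the data. A sketch
// using the same CK vector aliases:
static_assert(sizeof(ck::int8x4_t) == sizeof(int32_t), "4 x i8 == 1 x i32");
static_assert(sizeof(ck::int8x8_t) == sizeof(ck::int32x2_t), "8 x i8 == 2 x i32");
static_assert(sizeof(ck::int8x16_t) == sizeof(ck::int32x4_t), "16 x i8 == 4 x i32");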
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/buffer_view_declare.hpp"
#include "ck/utility/generic_memory_space_atomic.hpp"
namespace ck {
// Address Space: Vgpr
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: InvalidElementUseNumericalZeroValue and invalid_element_value_ should be a property of
// transforms of TensorView/Tensor
template <typename T, typename BufferSizeType, bool InvalidElementUseNumericalZeroValue>
struct BufferView<AddressSpaceEnum::Vgpr,
T,
BufferSizeType,
InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum::DefaultCoherence>
{
using type = T;
T* p_data_ = nullptr;
BufferSizeType buffer_size_;
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr BufferView() : p_data_{}, buffer_size_{}, invalid_element_value_{}
{
}
__host__ __device__ constexpr BufferView(T* p_data, BufferSizeType buffer_size)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{0}
{
}
__host__ __device__ constexpr BufferView(T* p_data,
BufferSizeType buffer_size,
T invalid_element_value)
: p_data_{p_data}, buffer_size_{buffer_size}, invalid_element_value_{invalid_element_value}
{
}
__device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return AddressSpaceEnum::Vgpr;
}
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
// i is offset of T
// FIXME: doesn't do is_valid check
__device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
// i is offset of T, not X. i should be aligned to X
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
// FIXME: remove InMemoryDataOperationEnum::Add
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
}
// FIXME: remove
__device__ static constexpr bool IsStaticBuffer() { return false; }
// FIXME: remove
__device__ static constexpr bool IsDynamicBuffer() { return true; }
__host__ __device__ void Print() const
{
printf("BufferView{");
// AddressSpace
printf("AddressSpace: Vgpr, ");
// p_data_
printf("p_data_: %p, ", static_cast<void*>(const_cast<remove_cvref_t<T>*>(p_data_)));
// buffer_size_
printf("buffer_size_: ");
print(buffer_size_);
printf(", ");
// invalid_element_value_
printf("invalid_element_value_: ");
print(invalid_element_value_);
printf("}");
}
};
} // namespace ck
@@ -4,10 +4,13 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/static_assert.hpp"
#include "ck/utility/remove_cvref.hpp"
#include "ck/utility/is_static.hpp"
#include "ck/utility/print.hpp"
#include "ck/utility/array.hpp"
#include "ck/utility/container_helper.hpp"
#include "ck/utility/statically_indexed_array.hpp"
#include "ck/utility/container_element_picker.hpp"
#include "ck/utility/multi_index.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/functional.hpp"
@@ -18,23 +21,29 @@
#include "ck/utility/ignore.hpp"
#include "ck/utility/integral_constant.hpp"
#include "ck/utility/math.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/math_ext.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/tuple_of_sequence_to_array_of_array.hpp"
#include "ck/utility/macro_func_array_to_sequence.hpp"
#include "ck/utility/macro_func_array_of_array_to_tuple_of_sequence.hpp"
#include "ck/utility/type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/utility/magic_division.hpp"
#include "ck/utility/c_style_pointer_cast.hpp"
#include "ck/utility/is_known_at_compile_time.hpp"
#include "ck/utility/transpose_vectors.hpp"
#include "ck/utility/inner_product.hpp"
#include "ck/utility/thread_group.hpp"
#include "ck/utility/meta_data_buffer.hpp"
#include "ck/utility/debug.hpp"
#include "ck/utility/amd_buffer_addressing.hpp"
#include "ck/utility/amd_wave_read_first_lane.hpp"
#include "ck/utility/amd_warp_shuffle.hpp"
#include "ck/utility/generic_memory_space_atomic.hpp"
#include "ck/utility/get_id.hpp"
#include "ck/utility/thread_group.hpp"
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CONTAINER_HELPER_HPP
#define CK_CONTAINER_HELPER_HPP
#pragma once
#include "sequence.hpp"
#include "sequence_helper.hpp"
@@ -10,7 +9,6 @@
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
namespace ck {
@@ -46,7 +44,7 @@ container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<I
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_array(old_array[Number<IRs>{}]...);
return make_array<remove_cvref_t<TData>>(old_array[IRs]...);
}
template <typename TData, index_t NSize, index_t... IRs>
@@ -208,10 +206,11 @@ container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData i
return y;
}
template <typename TData, index_t NSize, typename Reduce>
template <typename TData, index_t NSize, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, Init init)
{
#if 0
Array<TData, NSize> y;
TData r = init;
@@ -224,6 +223,21 @@ container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData i
y(Number<0>{}) = r;
return y;
#else
Array<TData, NSize> y;
TData r = init;
for(index_t i = NSize - 1; i > 0; --i)
{
y(i) = r;
r = f(r, x[i]);
}
y(0) = r;
return y;
#endif
}
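// Worked example for the loop above: with x = {1, 2, 3}, f = plus and
// init = 0, the iterations visit i = 2 then i = 1 and yield y = {5, 3, 0};
// each y[i] is init folded with every x[j] for j > i.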
template <index_t... Is, typename Reduce, index_t Init>
@@ -326,7 +340,7 @@ template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
return unpack2(
[&](auto&&... zs) { return make_array(std::forward<decltype(zs)>(zs)...); }, ax, ay);
[&](auto&&... zs) { return make_array<T>(std::forward<decltype(zs)>(zs)...); }, ax, ay);
}
template <typename... X, typename... Y>
@@ -345,35 +359,57 @@ __host__ __device__ constexpr auto container_concat(const Container& x)
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Array<T, N>& arr, Sequence<Is...>)
{
static_assert(N >= sizeof...(Is), "wrong! size");
STATIC_ASSERT(N >= sizeof...(Is), "wrong! size");
return make_array(arr[Number<Is>{}]...);
if constexpr(sizeof...(Is) > 0)
{
return make_array<T>(arr[Is]...);
}
else
{
return Array<T, 0>{};
}
}
template <typename... Ts, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Tuple<Ts...>& tup, Sequence<Is...>)
{
static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! size");
STATIC_ASSERT(sizeof...(Ts) >= sizeof...(Is), "wrong! size");
return make_tuple(tup[Number<Is>{}]...);
if constexpr(sizeof...(Is) > 0)
{
return make_tuple(tup[Number<Is>{}]...);
}
else
{
return Tuple<>{};
}
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void
set_container_subset(Array<T, N>& y, Sequence<Is...> picks, const Array<T, sizeof...(Is)>& x)
{
static_assert(N >= sizeof...(Is), "wrong! size");
STATIC_ASSERT(N >= sizeof...(Is), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
if constexpr(sizeof...(Is) > 0)
{
for(index_t i = 0; i < picks.Size(); ++i)
{
y(picks[i]) = x[i];
}
}
}
template <typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void
set_container_subset(Tuple<Ys...>& y, Sequence<Is...> picks, const Tuple<Xs...>& x)
template <typename Y, typename X, index_t... Is>
__host__ __device__ constexpr void set_container_subset(Y& y, Sequence<Is...> picks, const X& x)
{
static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! size");
STATIC_ASSERT(Y::Size() >= sizeof...(Is) && X::Size() == sizeof...(Is), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
if constexpr(sizeof...(Is) > 0)
{
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
}
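// Usage sketch for the subset helpers above (values illustrative): picking
// Sequence<0, 2> from an Array holding {10, 20, 30, 40} yields {10, 30}, and
// set_container_subset writes back through the same picks. The new
// if-constexpr branches make the empty pick list Sequence<> well-defined,
// yielding Array<T, 0> or Tuple<> directly.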
template <index_t... Is>
@@ -390,4 +426,3 @@ __host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence<Is...>)
}
} // namespace ck
#endif
@@ -3,396 +3,39 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic.hpp"
#include "buffer_view.hpp"
// FIXME: deprecate DynamicBuffer, use BufferView instead
namespace ck {
// T may be scalar or vector
// X may be scalar or vector
// T and X have same scalar type
// X contains multiple T
// FIXME: deprecate DynamicBuffer, use BufferView instead
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue,
AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
struct DynamicBuffer
{
using type = T;
T* p_data_;
ElementSpaceSize element_space_size_;
T invalid_element_value_ = T{0};
__host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
{
}
__host__ __device__ constexpr DynamicBuffer(T* p_data,
ElementSpaceSize element_space_size,
T invalid_element_value)
: p_data_{p_data},
element_space_size_{element_space_size},
invalid_element_value_{invalid_element_value}
{
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return BufferAddressSpace;
}
__host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
__host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_LOAD
bool constexpr use_amd_buffer_addressing = true;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(InvalidElementUseNumericalZeroValue)
{
return amd_buffer_load_invalid_element_return_zero<remove_cvref_t<T>,
t_per_x,
coherence>(
p_data_, i, is_valid_element, element_space_size_);
}
else
{
return amd_buffer_load_invalid_element_return_customized_value<remove_cvref_t<T>,
t_per_x,
coherence>(
p_data_, i, is_valid_element, element_space_size_, invalid_element_value_);
}
}
else
{
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
}
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
{
this->template AtomicAdd<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
{
this->template AtomicMax<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
// tmp += x;
// this->template Set<X>(i, is_valid_element, tmp);
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_STORE
bool constexpr use_amd_buffer_addressing = true;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
bool constexpr workaround_int8_ds_write_issue = true;
#else
bool constexpr workaround_int8_ds_write_issue = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
workaround_int8_ds_write_issue)
{
if(is_valid_element)
{
// HACK: the compiler would lower the IR "store<i8, 16> address_space(3)" into
// inefficient ISA, so try to let the compiler emit the IR "store<i32, 4>",
// which would be lowered to ds_write_b128
// TODO: remove this after compiler fix
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: casting the pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
}
}
else
{
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
{
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else
{
if(is_valid_element)
{
atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
}
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else if(is_valid_element)
{
atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence>
using DynamicBuffer = BufferView<BufferAddressSpace,
T,
ElementSpaceSize,
InvalidElementUseNumericalZeroValue,
Coherence>;
// FIXME: deprecate make_dynamic_buffer, use make_buffer_view instead
template <AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence,
typename T,
typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true, coherence>{
p, element_space_size};
return make_buffer_view<BufferAddressSpace, Coherence, T, ElementSpaceSize>(p,
element_space_size);
}
// FIXME: deprecate make_dynamic_buffer, use make_buffer_view instead
template <
AddressSpaceEnum BufferAddressSpace,
AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence,
AmdBufferCoherenceEnum Coherence = AmdBufferCoherenceEnum::DefaultCoherence,
typename T,
typename ElementSpaceSize,
typename X,
@@ -400,8 +43,8 @@ template <
__host__ __device__ constexpr auto
make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, false, coherence>{
p, element_space_size, invalid_element_value};
return make_buffer_view<BufferAddressSpace, Coherence, T, ElementSpaceSize>(
p, element_space_size, invalid_element_value);
}
} // namespace ck
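// Migration sketch: with the alias above, the legacy and new spellings build
// the same BufferView type (pointer p and size n illustrative):
//
//   auto legacy = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p, n);
//   auto fresh  = ck::make_buffer_view<ck::AddressSpaceEnum::Global>(p, n);
//
// so call sites can move off make_dynamic_buffer incrementally.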
@@ -20,7 +20,7 @@ struct static_ford_impl
{
__host__ __device__ constexpr static_ford_impl()
{
static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
static_assert(RemainLengths::Size() > 0, "wrong! should not get here");
}
// F signature: F(Sequence<...>)
@@ -55,7 +55,7 @@ struct ford_impl
{
__host__ __device__ constexpr ford_impl()
{
static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");
static_assert(RemainLengths::Size() > 0, "wrong! should not get here");
}
// F signature: F(Array<...> multi_id)
@@ -92,13 +92,13 @@ struct ford_impl<Sequence<>, Orders>
// will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
class Orders = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
struct static_ford
{
__host__ __device__ constexpr static_ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
static_assert(Lengths::Size() > 0, "wrong! Lengths is empty");
static_assert(Lengths::Size() == Orders::Size(), "wrong! inconsistent size");
}
// F signature: F(Sequence<...> multi_id)
@@ -117,13 +117,13 @@ struct static_ford
// over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
class Orders = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
struct ford
{
__host__ __device__ constexpr ford()
{
static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");
static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size");
static_assert(Lengths::Size() > 0, "wrong! Lengths is empty");
static_assert(Lengths::Size() == Orders::Size(), "wrong! inconsistent size");
}
// F signature: F(Array<...> multi_id)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_FUNCTIONAL4_HPP
#define CK_FUNCTIONAL4_HPP
#pragma once
#include "sequence.hpp"
#include "tuple.hpp"
@@ -21,7 +20,11 @@ struct unpack_impl<Sequence<Is...>>
template <typename F, typename X>
__host__ __device__ constexpr auto operator()(F&& f, X&& x) const
{
#if 0
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...);
#else
return std::forward<F>(f)(std::forward<X>(x).template At<Is>()...);
#endif
}
};
@@ -35,8 +38,13 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
{
#if 0
return std::forward<F>(f)(std::forward<X>(x).At(Number<Is>{})...,
std::forward<Y>(y).At(Number<Js>{})...);
#else
return std::forward<F>(f)(std::forward<X>(x).template At<Is>()...,
std::forward<Y>(y).template At<Js>()...);
#endif
}
};
@@ -62,4 +70,3 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
}
} // namespace ck
#endif
@@ -13,16 +13,24 @@ __host__ __device__ constexpr index_t get_warp_size()
return warpSize;
}
__device__ index_t get_grid_size() { return gridDim.x; }
__device__ index_t get_block_size() { return blockDim.x; }
// TODO: deprecate these
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
__device__ index_t get_warp_local_1d_id() { return threadIdx.x / get_warp_size(); }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }
// Use these instead
__device__ index_t get_lane_id() { return __lane_id(); }
__device__ index_t get_block_size() { return blockDim.x; }
__device__ index_t get_warp_id() { return threadIdx.x / get_warp_size(); }
__device__ index_t get_thread_id() { return threadIdx.x; }
__device__ index_t get_block_id() { return blockIdx.x; }
} // namespace ck
@@ -13,6 +13,8 @@ struct integral_constant
typedef integral_constant type;
__host__ __device__ constexpr operator value_type() const noexcept { return value; }
__host__ __device__ constexpr value_type operator()() const noexcept { return value; }
__host__ __device__ static constexpr bool IsStatic() { return true; };
__host__ __device__ void Print() const { print(v); }
};
template <typename TX, TX X, typename TY, TY Y>
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/ck.hpp"
#include "integral_constant.hpp"
#include "sequence.hpp"
#include "tuple.hpp"
namespace ck {
template <typename T>
struct is_known_at_compile_time;
template <>
struct is_known_at_compile_time<index_t>
{
static constexpr bool value = false;
};
template <>
struct is_known_at_compile_time<long_index_t>
{
static constexpr bool value = false;
};
template <typename T, T X>
struct is_known_at_compile_time<integral_constant<T, X>>
{
static constexpr bool value = true;
};
template <index_t... Is>
struct is_known_at_compile_time<Sequence<Is...>>
{
static constexpr bool value = true;
};
template <typename... Ts>
struct is_known_at_compile_time<Tuple<Ts...>>
{
__host__ __device__ static constexpr bool IsKnownAtCompileTime()
{
return container_reduce(
Tuple<Ts...>{},
[](auto x, bool r) {
return is_known_at_compile_time<remove_cvref_t<decltype(x)>>::value & r;
},
true);
}
static constexpr bool value = IsKnownAtCompileTime();
};
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/remove_cvref.hpp"
namespace ck {
namespace detail {
template <typename T>
struct is_static_impl
{
static constexpr bool value = T::IsStatic();
};
template <>
struct is_static_impl<int32_t>
{
static constexpr bool value = false;
};
template <>
struct is_static_impl<int64_t>
{
static constexpr bool value = false;
};
} // namespace detail
template <typename T>
using is_static = detail::is_static_impl<remove_cvref_t<T>>;
template <typename T>
inline constexpr bool is_static_v = is_static<T>::value;
// TODO: deprecate this
template <typename T>
using is_known_at_compile_time = is_static<T>;
} // namespace ck
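// Expected behavior of the trait above: any type exposing a constexpr
// IsStatic() member (e.g. integral_constant, which gains one in this diff)
// reports true, while bare runtime integers hit the int32_t/int64_t
// specializations and report false.
static_assert(ck::is_static_v<ck::integral_constant<ck::index_t, 4>>, "compile-time constant");
static_assert(!ck::is_static_v<ck::index_t>, "runtime integer");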
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/macro_func_array_to_sequence.hpp"
// Macro function
// convert constexpr Array<Array<index_t, xxx>, xxx> to Tuple<Sequence<...>, ...>
// Input:
// 1. a_of_b_impl: constexpr Array<Array<index_t, xxx>, xxx>
// 2. a_size: constexpr index_t
// 3. bs_sizes: constexpr Array<index_t, xxx>
// Output:
// Tuple<Sequence<...>, ...>
#define TO_TUPLE_OF_SEQUENCE(a_of_b_impl, a_size, bs_sizes) \
[a_of_b_impl, a_size, bs_sizes] { \
return ck::generate_tuple( \
[=](auto i) { \
constexpr auto b_impl = a_of_b_impl[i]; \
constexpr index_t b_size = bs_sizes[i]; \
constexpr auto b = TO_SEQUENCE(b_impl, b_size); \
return b; \
}, \
ck::Number<a_size>{}); \
}()
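// Hedged usage sketch (identifiers illustrative): for a constexpr
// Array<Array<index_t, 2>, 2> a_of_b holding {{1, 2}, {3, 4}} and
// bs_sizes = {2, 2}, TO_TUPLE_OF_SEQUENCE(a_of_b, 2, bs_sizes) evaluates to
// ck::Tuple<ck::Sequence<1, 2>, ck::Sequence<3, 4>>{}.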
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
// Macro function
// convert constexpr Array to Sequence
#define TO_SEQUENCE(a, n) \
[a, n] { \
static_assert(a.Size() >= n, "wrong! out of bound"); \
\
static_assert(n <= 10, "not implemented"); \
\
if constexpr(n == 0) \
{ \
return ck::Sequence<>{}; \
} \
else if constexpr(n == 1) \
{ \
return ck::Sequence<a[0]>{}; \
} \
else if constexpr(n == 2) \
{ \
return ck::Sequence<a[0], a[1]>{}; \
} \
else if constexpr(n == 3) \
{ \
return ck::Sequence<a[0], a[1], a[2]>{}; \
} \
else if constexpr(n == 4) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3]>{}; \
} \
else if constexpr(n == 5) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4]>{}; \
} \
else if constexpr(n == 6) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4], a[5]>{}; \
} \
else if constexpr(n == 7) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4], a[5], a[6]>{}; \
} \
else if constexpr(n == 8) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]>{}; \
} \
else if constexpr(n == 9) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]>{}; \
} \
else if constexpr(n == 10) \
{ \
return ck::Sequence<a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]>{}; \
} \
}()
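// Usage sketch: for a constexpr array a with elements {2, 4, 8},
// TO_SEQUENCE(a, 3) yields ck::Sequence<2, 4, 8>{}; n greater than 10 is
// rejected by the static_assert above until more branches are added.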
@@ -22,49 +22,25 @@ namespace ck {
// TODO:
// 1. Implement magic number division for int32_t
// 2. Implement magic number division for uint32_t with 32-bit value range
struct MagicDivision
struct MagicDivision32BitRange
{
// uint32_t
__host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
{
// WARNING: magic division is only applicable for division inside this range.
// You should use the return value of CalculateMagicNumbers, if division is not inside this
// range. The "else" logic below is to quiet down run-time error.
if(divisor >= 1 && divisor <= INT32_MAX)
{
uint32_t shift = 0;
for(shift = 0; shift < 32; ++shift)
{
if((1U << shift) >= divisor)
{
break;
}
}
uint64_t one = 1;
uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
// assert(multiplier <= 0xffffffffUL);
return make_tuple(uint32_t(multiplier), shift);
}
else
{
return make_tuple(uint32_t(0), uint32_t(0));
}
}
// WARNING: magic division is only valid for division inside this range.
// assert(divisor >= 1 && divisor <= INT32_MAX)
__host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
{
auto tmp = CalculateMagicNumbers(divisor);
uint32_t shift_u32 = 0;
return tmp[Number<0>{}];
}
while((1U << shift_u32) < divisor)
{
shift_u32++;
};
__host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor)
{
auto tmp = CalculateMagicNumbers(divisor);
uint64_t tmp_u64 = ((1UL << shift_u32) - divisor) << 32;
uint32_t multiplier_u32 = tmp_u64 / divisor + 1;
return tmp[Number<1>{}];
return make_tuple(multiplier_u32, shift_u32);
}
// integral_constant<uint32_t, .>
@@ -81,58 +57,107 @@ struct MagicDivision
                          integral_constant<uint32_t, shift>{});
    }

    // integral_constant<int32_t, .>
    template <int32_t Divisor>
    __host__ __device__ static constexpr auto
    CalculateMagicNumbers(integral_constant<int32_t, Divisor>)
    {
        return CalculateMagicNumbers(integral_constant<uint32_t, Divisor>{});
    }

    // magic division for uint32_t
    __device__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
        uint32_t tmp = __umulhi(dividend, multiplier);
        return (tmp + dividend) >> shift;
    }

    __host__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
        uint32_t tmp = (static_cast<uint64_t>(dividend) * multiplier) >> 32;
        return (tmp + dividend) >> shift;
    }

    // magic division for int32_t
    // HACK: treat dividend_i32 as if it were uint32_t; dividend_i32 needs to be
    // non-negative for the result to be correct
    // TODO: figure out how to do magic number division for int32_t as dividend
    __device__ static constexpr int32_t
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
        uint32_t tmp          = __umulhi(dividend_u32, multiplier);
        return (tmp + dividend_u32) >> shift;
    }

    __host__ static constexpr int32_t
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
        uint32_t tmp = (static_cast<uint64_t>(dividend_u32) * multiplier) >> 32;
        return (tmp + dividend_u32) >> shift;
    }
};
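// Editorial sketch (not part of the commit): a worked divide-by-3 example for
// the 32-bit-range scheme above. CalculateMagicNumbers(3) yields shift = 2
// (smallest shift with 2^shift >= 3) and
// multiplier = ((1 << 32) * (4 - 3)) / 3 + 1 = 1431655766.
// Host path for dividend = 10:
//   tmp = (uint64_t(10) * 1431655766) >> 32 = 3
//   (tmp + 10) >> 2 = 13 >> 2 = 3 == 10 / 3
// which can be checked at compile time:
//   static_assert(
//       ck::MagicDivision32BitRange::DoMagicDivision(10u, 1431655766u, 2u) == 3u, "");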
// magic number division
// This version only works for divisor and dividend in [0, 1 << 16]
struct MagicDivision16BitRange
{
    // uint32_t
    __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
    {
        // WARNING: magic division is only valid for division inside this range.
        // assert(divisor >= 1 && divisor <= (1U << 16));
        uint32_t shift_u32 = 0;

        while((1U << shift_u32) < divisor)
        {
            shift_u32++;
        };

        uint32_t one            = 1;
        uint32_t multiplier_u32 = ((one << 16) * ((one << shift_u32) - divisor)) / divisor + 1;

        return make_tuple(multiplier_u32, shift_u32);
    }

    // integral_constant<uint32_t, .>
    template <uint32_t Divisor>
    __host__ __device__ static constexpr auto
    CalculateMagicNumbers(integral_constant<uint32_t, Divisor>)
    {
        constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});

        constexpr uint32_t multiplier = tmp[Number<0>{}];
        constexpr uint32_t shift      = tmp[Number<1>{}];

        return make_tuple(integral_constant<uint32_t, multiplier>{},
                          integral_constant<uint32_t, shift>{});
    }

    // integral_constant<int32_t, .>
    template <int32_t Divisor>
    __host__ __device__ static constexpr auto
    CalculateMagicNumbers(integral_constant<int32_t, Divisor>)
    {
        return CalculateMagicNumbers(integral_constant<uint32_t, Divisor>{});
    }

    // magic division for uint32_t
    __device__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
        uint32_t tmp = (dividend * multiplier) >> 16;
        return (tmp + dividend) >> shift;
    }

    __host__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
        uint32_t tmp = (dividend * multiplier) >> 16;
        return (tmp + dividend) >> shift;
    }
@@ -144,7 +169,7 @@ struct MagicDivision
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
        uint32_t tmp          = (dividend_u32 * multiplier) >> 16;
        return (tmp + dividend_u32) >> shift;
    }
@@ -152,11 +177,14 @@ struct MagicDivision
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
        uint32_t tmp          = (dividend_u32 * multiplier) >> 16;
        return (tmp + dividend_u32) >> shift;
    }
};
// use 32bit version
using MagicDivision = MagicDivision32BitRange;
struct MDiv
{
// 1 dword -> 3 dword storage
......
@@ -240,20 +240,37 @@ struct less
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
};

__host__ __device__ constexpr int32_t next_power_of_two(int32_t x)
{
    // TODO: x needs to be in 2 ~ 0x7fffffff; 0, 1, or values larger than
    // 0x7fffffff will fail to compile
    return 1 << (32 - __builtin_clz(x - 1));
}

template <index_t X>
__host__ __device__ constexpr auto next_power_of_two()
{
    constexpr index_t y = next_power_of_two(X);
    return Number<y>{};
}

template <index_t X>
__host__ __device__ constexpr auto next_power_of_two(Number<X>)
{
    constexpr index_t y = next_power_of_two(X);
    return Number<y>{};
}

__host__ __device__ constexpr int32_t integer_log2_floor(int32_t x)
{
    // TODO: x needs to be in 1 ~ 0x7fffffff;
    // __builtin_clz produces an unexpected result if x is 0
    return 31 - __builtin_clz(x);
}

__host__ __device__ constexpr bool is_power_of_two_integer(int32_t x)
{
    // TODO: x needs to be in 1 ~ 0x7fffffff
    return x == (1 << integer_log2_floor(x));
}
} // namespace math
......
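// Editorial sketch (not part of the commit): the math helpers above at work.
//   ck::math::next_power_of_two(5)       == 8     (5 - 1 = 4, clz(4) = 29, 1 << 3)
//   ck::math::integer_log2_floor(8)      == 3     (31 - clz(8) = 31 - 28)
//   ck::math::is_power_of_two_integer(6) == false (1 << integer_log2_floor(6) == 4 != 6)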
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
// These operators ideally would take MultiIndex<NSize> instead of Tuple<Ys...>,
// even though the former is an alias of the latter, but the compiler cannot
// deduce NSize when MultiIndex<NSize> is used in the signature.
// TODO: how to fix this?
template <
typename... Ys,
typename X,
enable_if_t<!std::is_integral<X>::value && !std::is_floating_point<X>::value, bool> = false>
__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; });
return y;
}
template <
typename... Ys,
typename X,
enable_if_t<!std::is_integral<X>::value && !std::is_floating_point<X>::value, bool> = false>
__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; });
return y;
}
template <
typename... Xs,
typename Y,
enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; });
return r;
}
template <
typename... Xs,
typename Y,
enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; });
return r;
}
template <
typename... Xs,
typename Y,
enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; });
return r;
}
// MultiIndex = scalar * MultiIndex
template <typename... Xs,
typename Y,
enable_if_t<std::is_integral<Y>::value || std::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(Y a, const Tuple<Xs...>& x)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });
return r;
}
// MultiIndex = MultiIndex * scalar
template <typename... Xs,
typename Y,
enable_if_t<std::is_integral<Y>::value || std::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, Y a)
{
return a * x;
}
template <typename... Xs, typename... Ys>
__host__ __device__ constexpr auto operator/(const Tuple<Xs...>& x, const Tuple<Ys...>& y)
{
static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
constexpr index_t NSize = sizeof...(Xs);
return generate_tuple([&](auto i) { return x[i] / y[i]; }, Number<NSize>{});
}
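// Editorial sketch (not part of the commit): elementwise MultiIndex/Tuple
// arithmetic with the operators above, assuming ck::make_tuple over integers.
//
//   auto a = ck::make_tuple(1, 2, 3);
//   auto b = ck::make_tuple(4, 5, 6);
//   auto c = a + b; // (5, 7, 9)
//   auto d = 2 * b; // (8, 10, 12)
//   a += b;         // a becomes (5, 7, 9)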
namespace mathext {
template <typename... Xs>
__host__ __device__ constexpr auto exp(const Tuple<Xs...>& x)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::exp(x[i]); });
return r;
}
template <typename... Xs, typename Y>
__host__ __device__ constexpr auto max(const Tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> r;
static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::max(x[i], y[i]); });
return r;
}
} // namespace mathext
} // namespace ck
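// Editorial sketch (not part of the commit): mathext applies math::exp /
// math::max elementwise over a Tuple, e.g.
//   auto m = ck::mathext::max(ck::make_tuple(1.f, 5.f),
//                             ck::make_tuple(3.f, 2.f)); // (3, 5)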