Merge branch 'develop' into codegen-enable-hiprtc

c2cf0733 · Po Yen Chen · GitHub · 7643c909 · c3a4800c · c2cf0733
Unverified Commit c2cf0733 authored Oct 31, 2024 by Po Yen Chen Committed by GitHub Oct 31, 2024
20 changed files
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -91,8 +91,10 @@ struct buffer_view<address_space_enum::generic,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
+    CK_TILE_DEVICE constexpr auto get(index_t i,
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -107,11 +109,11 @@ struct buffer_view<address_space_enum::generic,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
            X tmp;
-            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
            return tmp;
 #else
-            return *c_style_pointer_cast<const X*>(&p_data_[i]);
+            return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
 #endif
        }
        else
@@ -134,17 +136,17 @@ struct buffer_view<address_space_enum::generic,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        if constexpr(Op == memory_operation_enum::set)
        {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
        }
        // FIXME: remove memory_operation_enum::add
        else if constexpr(Op == memory_operation_enum::add)
        {
-            auto tmp = this->template get<X>(i, is_valid_element);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
        }
    }
@@ -154,7 +156,7 @@ struct buffer_view<address_space_enum::generic,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -169,9 +171,9 @@ struct buffer_view<address_space_enum::generic,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
            X tmp = x;
-            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+            __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-            *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+            *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
        }
    }
@@ -276,8 +278,10 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
+    CK_TILE_DEVICE constexpr auto get(index_t i,
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -303,7 +307,7 @@ struct buffer_view<address_space_enum::global,
                                                                   t_per_x,
                                                                   Coherence,
                                                                   oob_conditional_check>(
-                    p_data_, i, is_valid_element, buffer_size_);
+                    p_data_, i + linear_offset, is_valid_element, buffer_size_);
            }
            else
            {
@@ -311,8 +315,11 @@ struct buffer_view<address_space_enum::global,
                    remove_cvref_t<T>,
                    t_per_x,
                    Coherence,
-                    oob_conditional_check>(
+                    oob_conditional_check>(p_data_,
-                    p_data_, i, is_valid_element, buffer_size_, invalid_element_value_);
+                                           i + linear_offset,
+                                           is_valid_element,
+                                           buffer_size_,
+                                           invalid_element_value_);
            }
        }
        else
@@ -322,11 +329,11 @@ struct buffer_view<address_space_enum::global,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp;
-                __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+                __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
                return tmp;
 #else
-                return *c_style_pointer_cast<const X*>(&p_data_[i]);
+                return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
 #endif
            }
            else
@@ -352,7 +359,8 @@ struct buffer_view<address_space_enum::global,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
    CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
-                                          index_t i,
+                                          index_t v_offset,
+                                          index_t i_offset,
                                          bool is_valid_element,
                                          bool_constant<pre_nop> = {}) const
    {
@@ -366,7 +374,38 @@ struct buffer_view<address_space_enum::global,
        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
        amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
-            dst, cached_buf_res_, i, is_valid_element, bool_constant<pre_nop>{});
+            dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant<pre_nop>{});
+    }
+    // i is offset of T, not X. i should be aligned to X
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE constexpr auto async_get(CK_TILE_LDS_ADDR remove_cvref_t<T>* smem,
+                                            index_t i,
+                                            index_t linear_offset,
+                                            bool is_valid_element,
+                                            bool_constant<oob_conditional_check> = {}) const
+    {
+        // X is vector of T
+        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
+        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
+                      "wrong! X should contain multiple T");
+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+        amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
+            smem,
+            cached_buf_res_,
+            i,
+            linear_offset,
+            is_valid_element,
+            bool_constant<oob_conditional_check>{});
    }
    // i is offset of T, not X. i should be aligned to X
@@ -378,6 +417,7 @@ struct buffer_view<address_space_enum::global,
                  bool>::type = false>
    CK_TILE_DEVICE constexpr auto async_get_raw(remove_cvref_t<T>* smem,
                                                index_t i,
+                                                index_t linear_offset,
                                                bool /*is_valid_element*/,
                                                bool_constant<pre_nop> = {}) const
    {
@@ -391,7 +431,7 @@ struct buffer_view<address_space_enum::global,
        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
        amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
-            smem, cached_buf_res_, i, bool_constant<pre_nop>{});
+            smem, cached_buf_res_, i, linear_offset, bool_constant<pre_nop>{});
    }
    // i is offset of T, not X. i should be aligned to X
@@ -401,25 +441,25 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        if constexpr(Op == memory_operation_enum::set)
        {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
        }
        else if constexpr(Op == memory_operation_enum::atomic_add)
        {
-            this->template atomic_add<X>(i, is_valid_element, x);
+            this->template atomic_add<X>(i, linear_offset, is_valid_element, x);
        }
        else if constexpr(Op == memory_operation_enum::atomic_max)
        {
-            this->template atomic_max<X>(i, is_valid_element, x);
+            this->template atomic_max<X>(i, linear_offset, is_valid_element, x);
        }
        // FIXME: remove memory_operation_enum::add
        else if constexpr(Op == memory_operation_enum::add)
        {
-            auto tmp = this->template get<X>(i, is_valid_element);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
            // tmp += x;
            // this->template set<X>(i, is_valid_element, tmp);
        }
@@ -432,7 +472,7 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -453,7 +493,7 @@ struct buffer_view<address_space_enum::global,
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
            amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
        }
        else
        {
@@ -462,9 +502,9 @@ struct buffer_view<address_space_enum::global,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
                X tmp = x;
-                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+                __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-                *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+                *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
            }
        }
@@ -477,7 +517,7 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void set_raw(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -489,7 +529,7 @@ struct buffer_view<address_space_enum::global,
        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
        amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
-            x, p_data_, i, is_valid_element, buffer_size_);
+            x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
    }
    template <typename X,
@@ -497,7 +537,8 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void atomic_add(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void
+    atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
@@ -532,13 +573,13 @@ struct buffer_view<address_space_enum::global,
        if constexpr(use_amd_buffer_addressing)
        {
            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
        }
        else
        {
            if(is_valid_element)
            {
-                atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
+                atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
            }
        }
    }
@@ -548,7 +589,8 @@ struct buffer_view<address_space_enum::global,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void atomic_max(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void
+    atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -572,11 +614,11 @@ struct buffer_view<address_space_enum::global,
        if constexpr(use_amd_buffer_addressing)
        {
            amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
-                x, p_data_, i, is_valid_element, buffer_size_);
+                x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
        }
        else if(is_valid_element)
        {
-            atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
+            atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
        }
    }
@@ -668,8 +710,10 @@ struct buffer_view<address_space_enum::lds,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
+    CK_TILE_DEVICE constexpr auto get(index_t i,
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -684,14 +728,14 @@ struct buffer_view<address_space_enum::lds,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
            X tmp;
-            __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
+            __builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
            return tmp;
 #else
            using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
                                       scalar_per_t_vector * scalar_per_x_vector>;
            // using buf_t = ushort __attribute__((ext_vector_type(8)));
-            auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i]);
+            auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
            return bit_cast<X>(rtn);
 #endif
        }
@@ -708,6 +752,23 @@ struct buffer_view<address_space_enum::lds,
        }
    }
+    // i is offset of T, not X. i should be aligned to X
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
+                  bool>::type = false>
+    CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
+                                          index_t v_offset,
+                                          index_t i_offset,
+                                          bool /*is_valid_element*/,
+                                          bool_constant<pre_nop> = {}) const
+    {
+        smem_load<sizeof(X)>{}(dst, v_offset * sizeof(T), i_offset * sizeof(T));
+    }
    // i is offset of T, not X. i should be aligned to X
    template <memory_operation_enum Op,
              typename X,
@@ -715,17 +776,17 @@ struct buffer_view<address_space_enum::lds,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        if constexpr(Op == memory_operation_enum::set)
        {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
        }
        // FIXME: remove memory_operation_enum::add
        else if constexpr(Op == memory_operation_enum::add)
        {
-            auto tmp = this->template get<X>(i, is_valid_element);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
        }
    }
@@ -735,7 +796,7 @@ struct buffer_view<address_space_enum::lds,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -751,6 +812,7 @@ struct buffer_view<address_space_enum::lds,
        bool constexpr workaround_int8_ds_write_issue = false;
 #endif
+        i += linear_offset; // simplicity
        if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
                                  int8_t>::value &&
                     workaround_int8_ds_write_issue)
@@ -952,8 +1014,10 @@ struct buffer_view<address_space_enum::vgpr,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE constexpr auto
+    CK_TILE_DEVICE constexpr auto get(index_t i,
-    get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
+                                      index_t /*linear_offset*/,
+                                      bool is_valid_element,
+                                      bool_constant<oob_conditional_check> = {}) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -995,17 +1059,17 @@ struct buffer_view<address_space_enum::vgpr,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        if constexpr(Op == memory_operation_enum::set)
        {
-            this->template set<X>(i, is_valid_element, x);
+            this->template set<X>(i, linear_offset, is_valid_element, x);
        }
        // FIXME: remove memory_operation_enum::add
        else if constexpr(Op == memory_operation_enum::add)
        {
-            auto tmp = this->template get<X>(i, is_valid_element);
+            auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
-            this->template set<X>(i, is_valid_element, x + tmp);
+            this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
        }
    }
@@ -1015,7 +1079,7 @@ struct buffer_view<address_space_enum::vgpr,
                  std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                               typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
                  bool>::type = false>
-    CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
+    CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
@@ -1030,9 +1094,9 @@ struct buffer_view<address_space_enum::vgpr,
 #if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
            X tmp = x;
-            __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
+            __builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
 #else
-            *c_style_pointer_cast<X*>(&p_data_[i]) = x;
+            *c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
 #endif
        }
    }

--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -12,6 +12,7 @@
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/tensor/null_tile_window.hpp"
 #include "ck_tile/core/tensor/null_tensor.hpp"
@@ -28,9 +29,48 @@ CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution<BottomT
                                                                         NumCoord>& tile_window,
                              bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.load(bool_constant<oob_conditional_check>{});
+    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
 }
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile(const tile_window_linear<BottomTensorView_,
+                                                       WindowLengths_,
+                                                       TileDistribution_,
+                                                       LinearBottomDims_>& tile_window,
+                              bool_constant<oob_conditional_check> = {})
+{
+    return tile_window.load(number<-1>{}, bool_constant<oob_conditional_check>{});
+}
+template <typename DistributedTensor_,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          index_t NumCoord,
+          bool oob_conditional_check = true>
+CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
+                              const tile_window_with_static_distribution<BottomTensorView_,
+                                                                         WindowLengths_,
+                                                                         TileDistribution_,
+                                                                         NumCoord>& tile_window,
+                              bool_constant<oob_conditional_check> = {})
+{
+    return tile_window.load(dst_tile, bool_constant<oob_conditional_check>{});
+}
+/**
+ * @brief Loads a tile of data using inline assembly.
+ *
+ * @note Bear in mind that when loading data this way, you have to manually initialize your
+ *       thread buffer and synchronize the load afterwards to make sure it has completed
+ *       before using the loaded data from registers.
+ *       @see `tile_window_with_static_distribution::init_raw()` and `buffer_view.hpp`
+ *       @see  `buffer_load_fence()`
+ */
 template <typename T,
          typename BottomTensorView_,
          typename WindowLengths_,
@@ -46,7 +86,27 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
                                  bool_constant<oob_conditional_check> = {},
                                  bool_constant<pre_nop>               = {})
 {
-    tile_window.load_raw(tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+    tile_window.load_raw(
+        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+}
+template <typename T,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE auto load_tile_raw(T& tile,
+                                  const tile_window_linear<BottomTensorView_,
+                                                           WindowLengths_,
+                                                           TileDistribution_,
+                                                           LinearBottomDims_>& tile_window,
+                                  bool_constant<oob_conditional_check> = {},
+                                  bool_constant<pre_nop>               = {})
+{
+    tile_window.load_raw(
+        tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 template <typename LdsTileWindow_,
@@ -66,7 +126,26 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile,
                    bool_constant<pre_nop>               = {})
 {
    return tile_window.async_load_raw(
-        lds_tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
+}
+template <typename LdsTileWindow_,
+          typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          bool oob_conditional_check = true,
+          bool pre_nop               = false>
+CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile,
+                                        const tile_window_linear<BottomTensorView_,
+                                                                 WindowLengths_,
+                                                                 TileDistribution_,
+                                                                 LinearBottomDims_>& tile_window,
+                                        bool_constant<oob_conditional_check> = {},
+                                        bool_constant<pre_nop>               = {})
+{
+    return tile_window.async_load_raw(
+        lds_tile, number<-1>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0)

--- a/include/ck_tile/core/tensor/null_tile_window.hpp
+++ b/include/ck_tile/core/tensor/null_tile_window.hpp
@@ -80,6 +80,13 @@ CK_TILE_DEVICE constexpr auto make_tile_window(null_tensor_view,
    return null_tile_window<remove_cvref_t<WindowLengths>>{window_lengths};
 }
+template <typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto make_tile_window(const null_tile_window<WindowLengths>& t,
+                                               const StaticTileDistribution&)
+{
+    return t;
+}
 template <typename WindowLengths>
 CK_TILE_DEVICE void
 move_tile_window(null_tile_window<WindowLengths>&,

--- a/include/ck_tile/core/tensor/shuffle_tile.hpp
+++ b/include/ck_tile/core/tensor/shuffle_tile.hpp
@@ -109,7 +109,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
        // get input vectors
        static_for<0, num_vec_in, 1>{}([&](auto i) {
-            constexpr auto idx_y_in = generate_array(
+            constexpr auto idx_y_in = generate_tuple(
                [&](auto ii) {
                    return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
                },

--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -187,4 +187,18 @@ set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_ten
    });
 }
+// this function is used inside a loop over spans
+template <typename YLengths, index_t XUnpacks>
+CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number<XUnpacks>)
+{
+    constexpr auto y_size  = reduce_on_sequence(YLengths{}, multiplies{}, number<1>{});
+    constexpr auto y_packs = number<XUnpacks>{};
+    static_assert(y_size % y_packs == 0);
+    constexpr auto y_slice_size = y_size / y_packs;
+    constexpr auto slice_info = slice_sequence(YLengths{}, number<y_slice_size>{});
+    constexpr auto unpacks    = slice_info[number<1>{}];
+    return unpacks;
+}
 } // namespace ck_tile
--- a/include/ck_tile/core/tensor/store_tile.hpp
+++ b/include/ck_tile/core/tensor/store_tile.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 namespace ck_tile {
@@ -72,7 +73,7 @@ store_tile(tile_window_with_static_distribution<BottomTensorView_,
                                                NumCoord>& tile_window,
           const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
 {
-    tile_window.store(dstr_tensor);
+    tile_window.store(dstr_tensor, number<-1>{});
 }
 template <typename BottomTensorView_,
@@ -87,7 +88,33 @@ store_tile_raw(tile_window_with_static_distribution<BottomTensorView_,
                                                    NumCoord>& tile_window,
               const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
 {
-    tile_window.store_raw(dstr_tensor);
+    tile_window.store_raw(dstr_tensor, number<-1>{});
+}
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          typename DataType_>
+CK_TILE_DEVICE void store_tile( // store_tile overload for tile_window_linear
+    tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
+        tile_window,
+    const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    tile_window.store(dstr_tensor, number<-1>{}); // delegates to the window; number<-1> presumably selects its default access - confirm in tile_window_linear
+}
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename LinearBottomDims_,
+          typename DataType_>
+CK_TILE_DEVICE void store_tile_raw( // store_tile_raw overload for tile_window_linear
+    tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
+        tile_window,
+    const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    tile_window.store_raw(dstr_tensor, number<-1>{}); // delegates to the window; number<-1> presumably selects its default access - confirm in tile_window_linear
 }
 } // namespace ck_tile
--- a/include/ck_tile/core/tensor/sweep_tile.hpp
+++ b/include/ck_tile/core/tensor/sweep_tile.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/core/numeric/integral_constant.hpp"
 #include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/functional_with_tuple.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 namespace ck_tile {
@@ -27,4 +28,281 @@ CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F& f)
    });
 }
+// Unpacked span sweep: this version calls the functor with several indices (multi-arg) per step.
+// Unpacks is forwarded to static_uford; its default is uniform_sequence_gen<Impl::size(), 1>.
+template <
+    typename TileDistributedSpan_, // tile_distributed_span<...>
+    typename F,                    // signature: F(tile_distributed_index<...>...), one argument per unpacked index
+    typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
+CK_TILE_DEVICE void sweep_tile_uspan(TileDistributedSpan_, const F& f, Unpacks = {})
+{
+    using DstrSpan = remove_cvref_t<TileDistributedSpan_>;
+    static_uford<typename DstrSpan::Impl, Unpacks>{}( // each raw index is wrapped by make_tile_distributed_index before reaching f
+        [&](auto... dstr_idx_impl) { f(detail::make_tile_distributed_index(dstr_idx_impl)...); });
+}
+namespace impl {
+template <typename, typename, typename>
+struct sweep_tile_impl; // primary template; specialized below on the sequence of span dims still to sweep
+template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
+struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>> // handles span I, then recurses on Is...
+{
+    CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const // Y unpacks of span I, derived from UnpacksPerXDim[I]
+    {
+        constexpr auto spans     = DistributedTensor::get_distributed_spans();
+        constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
+        constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
+        constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
+        return y_unpacks;
+    }
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const // accesses of span I times those of all remaining spans
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        return u.get_num_of_access() *
+               sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                   .get_num_of_access();
+    }
+    template <typename F, typename SpanIdx>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const // full sweep of span I and everything after it
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        sweep_tile_uspan(
+            spans[number<I>{}],
+            [&](auto... i_idx) {
+                const auto next_span_idx = embed_tuples( // extend each partial index si with every index unpacked from span I
+                    [&](auto si) { return make_tuple(concat_tuple(si, make_tuple(i_idx))...); },
+                    span_idx);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx);
+            },
+            get_y_unpacks());
+    }
+    template <typename F, typename SpanIdx, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const // variant driven by a single flattened access index
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        constexpr auto access_stride = // number of accesses covered by the remaining spans
+            sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                .get_num_of_access();
+        constexpr auto curr_i_access = number<i_access / access_stride>{}; // access index handed to span I
+        constexpr auto next_i_access = number<i_access % access_stride>{}; // remainder forwarded to the remaining spans
+        u( // static_uford is used directly here, so the raw indices are wrapped explicitly below
+            [&](auto... i_idx) {
+                const auto next_span_idx = embed_tuples(
+                    [&](auto si) {
+                        return make_tuple(concat_tuple(
+                            si, make_tuple(detail::make_tile_distributed_index(i_idx)))...);
+                    },
+                    span_idx);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx, next_i_access);
+            },
+            curr_i_access);
+    }
+};
+template <typename DistributedTensor, typename UnpacksPerXDim>
+struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<>> // end of recursion: no span dims left
+{
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const { return 1; } // neutral factor for the products computed by the callers
+    template <typename F, typename SpanIdx>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const
+    {
+        unpack(f, span_idx); // presumably expands span_idx into f's argument list - confirm in unpack()
+    }
+    template <typename F, typename SpanIdx, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const // the access index is not read at this level
+    {
+        unpack(f, span_idx);
+    }
+};
+template <typename, typename, typename>
+struct sweep_tile_impl_0; // primary template; only the non-empty sequence specialization below is defined here
+// TODO: support empty tuple to remove this "entry-point" like function
+template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
+struct sweep_tile_impl_0<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>> // first level: starts the span index instead of extending one
+{
+    CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const // Y unpacks of span I, derived from UnpacksPerXDim[I]
+    {
+        constexpr auto spans     = DistributedTensor::get_distributed_spans();
+        constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
+        constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
+        constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
+        return y_unpacks;
+    }
+    CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const // accesses of span I times those of all remaining spans
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        return u.get_num_of_access() *
+               sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                   .get_num_of_access();
+    }
+    template <typename F>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f) const // full sweep over all spans
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        sweep_tile_uspan(
+            spans[number<I>{}],
+            [&](auto... i_idx) {
+                constexpr auto next_span_idx = make_tuple(make_tuple(i_idx)...); // one single-element tuple per unpacked index
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx);
+            },
+            get_y_unpacks());
+    }
+    template <typename F, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, number<i_access>) const // variant driven by a single flattened access index
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto u =
+            static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
+        constexpr auto access_stride = // number of accesses covered by the remaining spans
+            sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
+                .get_num_of_access();
+        constexpr auto curr_i_access = number<i_access / access_stride>{}; // access index handed to span I
+        constexpr auto next_i_access = number<i_access % access_stride>{}; // remainder forwarded to the remaining spans
+        u( // static_uford is used directly here, so the raw indices are wrapped explicitly below
+            [&](auto... i_idx) {
+                constexpr auto next_span_idx =
+                    make_tuple(make_tuple(detail::make_tile_distributed_index(i_idx))...);
+                sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
+                    f, next_span_idx, next_i_access);
+            },
+            curr_i_access);
+    }
+};
+} // namespace impl
+/*
+ * Enhanced sweep-tile utility that can control the unpacks along each X-dim.
+ * The lambda arguments are distributed indices, which can be plugged directly
+ * into the distributed tensor as setter/getter.
+ *
+ * e.g. in the snippets below, y has the type DistributedTensor and r is a row scale
+ *
+ * // sweep tile 1 by 1
+ * sweep_tile<DistributedTensor>([&](auto idx) {
+ *     constexpr auto row_id = make_tuple(idx[number<0>{}]);
+ *     y(idx)                = y(idx) * r(row_id);
+ * });
+ *
+ * // sweep tile with 2 pixels from the last dim per function call
+ * sweep_tile<DistributedTensor>(
+ *     [&](auto idx_0, auto idx_1) {
+ *         constexpr auto row_id = make_tuple(idx_0[number<0>{}]);
+ *         y(idx_0)              = y(idx_0) * r(row_id);
+ *         y(idx_1)              = y(idx_1) * r(row_id);
+ *     },
+ *     sequence<1, 2>{});
+ *
+ * // sweep tile with 2x2 pixels per function call
+ * sweep_tile<DistributedTensor>(
+ *     [&](auto idx_00, auto idx_01, auto idx_10, auto idx_11) {
+ *         constexpr auto row_id0 = make_tuple(idx_00[number<0>{}]);
+ *         constexpr auto row_id1 = make_tuple(idx_10[number<0>{}]);
+ *         y(idx_00)              = y(idx_00) * r(row_id0);
+ *         y(idx_01)              = y(idx_01) * r(row_id0);
+ *         y(idx_10)              = y(idx_10) * r(row_id1);
+ *         y(idx_11)              = y(idx_11) * r(row_id1);
+ *     },
+ *     sequence<2, 2>{});
+ *
+ * TODO: do we need constexpr? The lambda function could be non-constexpr.
+ */
+template <typename DistributedTensor,
+          typename F,
+          typename UnpacksPerXDim =
+              typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE constexpr void sweep_tile(const F& f, UnpacksPerXDim = {})
+{
+    constexpr auto spans = DistributedTensor::get_distributed_spans();
+    impl::sweep_tile_impl_0<DistributedTensor, // entry point of the per-span recursion
+                            UnpacksPerXDim,
+                            typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(f); // one sequence entry per distributed span
+}
+template <typename DistributedTensor,
+          typename F,
+          typename UnpacksPerXDim =
+              typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE constexpr void
+sweep_tile(const DistributedTensor&, const F& f, UnpacksPerXDim = {}) // the tensor argument is unnamed: only its type is used
+{
+    sweep_tile<DistributedTensor, F, UnpacksPerXDim>(f, UnpacksPerXDim{}); // forwards to the type-only overload
+}
+/*
+ * A sweep-tile object that can also issue the functor one access at a time, via operator()(number<i>).
+ * Note that this struct holds a copy of the functor, but does not hold the distributed tensor.
+ * Calling operator()() with no argument forwards to sweep_tile().
+ */
+template <typename DistributedTensor_,
+          typename F_,
+          typename UnpacksPerXDim_ =
+              typename uniform_sequence_gen<DistributedTensor_::get_num_of_dimension(), 1>::type>
+struct tile_sweeper
+{
+    using DistributedTensor = remove_cvref_t<DistributedTensor_>;
+    using F                 = remove_cvref_t<F_>;
+    using UnpacksPerXDim    = remove_cvref_t<UnpacksPerXDim_>;
+    CK_TILE_HOST_DEVICE tile_sweeper(const F& f_, UnpacksPerXDim = {}) : f(f_) {}
+    CK_TILE_HOST_DEVICE tile_sweeper(const DistributedTensor&, const F& f_, UnpacksPerXDim = {}) // tensor argument is unnamed: it is not stored
+        : f(f_)
+    {
+    }
+    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access() // value reported by the sweep_tile_impl_0 entry point
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        constexpr auto tmp =
+            impl::sweep_tile_impl_0<DistributedTensor,
+                                    UnpacksPerXDim,
+                                    typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{};
+        return tmp.get_num_of_access();
+    }
+    CK_TILE_HOST_DEVICE void operator()() const // whole sweep in one call
+    {
+        sweep_tile<DistributedTensor>(f, UnpacksPerXDim{});
+    }
+    template <index_t i_access>
+    CK_TILE_HOST_DEVICE void operator()(number<i_access>) const // issue the functor for access i_access only
+    {
+        constexpr auto spans = DistributedTensor::get_distributed_spans();
+        impl::sweep_tile_impl_0<DistributedTensor,
+                                UnpacksPerXDim,
+                                typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(
+            f, number<i_access>{});
+    }
+    F f; // functor copy made by the constructors
+};
+// No guide for the (f, unpacks) constructor: T could not be deduced, and partial deduction is not allowed
+// template <typename T, typename F, typename U>
+// CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
+// deduction guide for the (tensor, f, unpacks) constructor
+template <typename T,
+          typename F,
+          typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
+CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const T&, const F&, U = {})->tile_sweeper<T, F, U>;
 } // namespace ck_tile
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -16,6 +16,24 @@
 namespace ck_tile {
+/*
+ * tensor_view
+ * abstracts the underlying memory buffer (global, LDS, etc...)
+ * and provides a unified get/set interface for access
+ *
+ * For addressing into the buffer we use 2 variables:
+ * coord : ND tensor coordinate, from which the actual offset is calculated
+ * linear_offset : 1D offset, will be used in the immediate field of
+ *   the buffer instruction to help reduce register usage
+ *
+ * User can use either of the fields, or both, to index into the tensor
+ *
+ * We usually provide 2 sets of API for buffer get/set, e.g.
+ * get_vectorized_elements()/get_vectorized_elements_raw()
+ * the former usually will call an intrinsic or normal C function, the latter
+ * usually will call an inline-asm function
+ *
+ */
 template <typename BufferView_,
          typename TensorDesc_,
          memory_operation_enum DstInMemOp_ = memory_operation_enum::set>
@@ -49,22 +67,6 @@ struct tensor_view
    CK_TILE_HOST_DEVICE constexpr auto& get_buffer_view() { return buf_; }
-#if 0
-    CK_TILE_HOST_DEVICE constexpr DataType get_element(const TensorCoord& coord) const
-    {
-        return buf_.template get<DataType>(
-            coord.get_offset(),
-            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
-    }
-    CK_TILE_HOST_DEVICE constexpr void set_element(const TensorCoord& coord, const DataType& x)
-    {
-        buf_.template set<DataType>(
-            coord.get_offset(),
-            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
-            x);
-    }
-#endif
    // X is vector of DataType.
    // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
    template <typename X,
@@ -75,14 +77,34 @@ struct tensor_view
                  bool>::type = false>
    CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
    get_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
                            bool_constant<oob_conditional_check> = {}) const
    {
        return buf_.template get<X>(
            coord.get_offset(),
+            linear_offset,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            bool_constant<oob_conditional_check>{});
    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
+    get_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset, // extra 1D offset, passed to the buffer next to coord's offset
+                            bool is_valid_element, // caller-supplied validity flag, forwarded to the buffer unchanged
+                            bool_constant<oob_conditional_check> = {}) const
+    {
+        return buf_.template get<X>(coord.get_offset(),
+                                    linear_offset,
+                                    is_valid_element,
+                                    bool_constant<oob_conditional_check>{});
+    }
    // X is vector of DataType.
    // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
    template <typename X,
@@ -94,12 +116,90 @@ struct tensor_view
                  bool>::type = false>
    CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst,
                                                         const TensorCoord& coord,
+                                                         index_t linear_offset,
                                                         bool_constant<oob_conditional_check> = {},
                                                         bool_constant<pre_nop> = {}) const
    {
        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
            dst,
            coord.get_offset(),
+            linear_offset,
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
+            bool_constant<pre_nop>{});
+    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst, // destination passed through to get_raw
+                                                         const TensorCoord& coord,
+                                                         index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                                                         bool is_valid_element, // caller-supplied validity flag, forwarded unchanged
+                                                         bool_constant<oob_conditional_check> = {},
+                                                         bool_constant<pre_nop> = {}) const
+    {
+        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
+            dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem, // destination pointer handed to async_get
+                                  const TensorCoord& coord,
+                                  index_t linear_offset) const // extra 1D offset, passed next to coord's offset
+    {
+        return buf_.template async_get<X>(
+            smem,
+            coord.get_offset(),
+            linear_offset,
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), // validity flag computed from desc_ and coord
+            bool_constant<oob_conditional_check>{});
+    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem, // destination pointer handed to async_get
+                                  const TensorCoord& coord,
+                                  index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                                  bool is_valid_element) const // caller-supplied validity flag, forwarded unchanged
+    {
+        return buf_.template async_get<X>(smem,
+                                          coord.get_offset(),
+                                          linear_offset,
+                                          is_valid_element,
+                                          bool_constant<oob_conditional_check>{});
+    }
+    template <typename X,
+              bool pre_nop = false,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem, // destination pointer handed to async_get_raw
+                                      const TensorCoord& coord,
+                                      index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                                      bool_constant<pre_nop> = {}) const
+    {
+        return buf_.template async_get_raw<X>(
+            smem,
+            coord.get_offset(),
+            linear_offset,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), // validity flag computed from desc_ and coord
            bool_constant<pre_nop>{});
    }
@@ -110,11 +210,15 @@ struct tensor_view
                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                  bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void async_get_vectorized_elements_raw(
+    CK_TILE_HOST_DEVICE constexpr void
-        remove_cvref_t<DataType>* smem, const TensorCoord& coord, bool_constant<pre_nop> = {}) const
+    async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem,
+                                      const TensorCoord& coord,
+                                      index_t linear_offset,
+                                      bool is_valid_element,
+                                      bool_constant<pre_nop> = {}) const
    {
        return buf_.template async_get_raw<X>(
-            smem, coord.get_offset(), true /*not used*/, bool_constant<pre_nop>{});
+            smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
    }
    // X is vector of DataType.
@@ -125,11 +229,15 @@ struct tensor_view
                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                  bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements(
+    CK_TILE_HOST_DEVICE constexpr void
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    set_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
+                            const X& x,
+                            bool_constant<oob_conditional_check> = {})
    {
        buf_.template set<X, oob_conditional_check>(
            coord.get_offset(),
+            linear_offset,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
@@ -140,15 +248,53 @@ struct tensor_view
                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                  bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements_raw(
+    CK_TILE_HOST_DEVICE constexpr void
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    set_vectorized_elements(const TensorCoord& coord,
+                            index_t linear_offset,
+                            bool is_valid_element,
+                            const X& x,
+                            bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template set<X, oob_conditional_check>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements_raw(const TensorCoord& coord,
+                                index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                                const X& x, // value handed to set_raw
+                                bool_constant<oob_conditional_check> = {})
    {
        buf_.template set_raw<X, oob_conditional_check>(
            coord.get_offset(),
+            linear_offset,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord), // validity flag computed from desc_ and coord
            x);
    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    set_vectorized_elements_raw(const TensorCoord& coord,
+                                index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                                bool is_valid_element, // caller-supplied validity flag, forwarded unchanged
+                                const X& x, // value handed to set_raw
+                                bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template set_raw<X, oob_conditional_check>(
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
    // X is vector of DataType.
    // "coord" is coordinate of DataType, not X. "coord" should be aligned to X
    template <typename X,
@@ -157,15 +303,36 @@ struct tensor_view
                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
                  bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void update_vectorized_elements(
+    CK_TILE_HOST_DEVICE constexpr void
-        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    update_vectorized_elements(const TensorCoord& coord,
+                               index_t linear_offset,
+                               const X& x,
+                               bool_constant<oob_conditional_check> = {})
    {
        buf_.template update<DstInMemOp, X, oob_conditional_check>(
            coord.get_offset(),
+            linear_offset,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
+    template <typename X,
+              bool oob_conditional_check = true,
+              typename std::enable_if<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool>::type = false> // enabled only when X and DataType share the same scalar type
+    CK_TILE_HOST_DEVICE constexpr void
+    update_vectorized_elements(const TensorCoord& coord,
+                               index_t linear_offset, // extra 1D offset, passed next to coord's offset
+                               bool is_valid_element, // caller-supplied validity flag, forwarded unchanged
+                               const X& x, // value handed to update
+                               bool_constant<oob_conditional_check> = {})
+    {
+        buf_.template update<DstInMemOp, X, oob_conditional_check>( // DstInMemOp is passed to the buffer as the update operation
+            coord.get_offset(), linear_offset, is_valid_element, x);
+    }
    CK_TILE_HOST_DEVICE void print() const
    {
        printf("tensor_view{");

--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -17,6 +17,14 @@
 namespace ck_tile {
+namespace detail {
+template <typename Distribution>
+CK_TILE_HOST_DEVICE auto get_partition_index(Distribution) // the argument is unnamed: only its type is used
+{
+    return Distribution::_get_partition_index(); // forwards to the distribution's static member
+}
+} // namespace detail
 // distributed span
 template <index_t... PartialHsLengths>
 struct tile_distributed_span
@@ -83,6 +91,21 @@ struct tile_distribution
    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_p() { return NDimP; }
    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_r() { return NDimR; }
+    CK_TILE_HOST_DEVICE static auto _get_partition_index()
+    {
+        // only NDimP == 1 (lane id) and NDimP == 2 (warp id, lane id) are supported, i.e. warp-tile and block-tile
+        static_assert(NDimP == 1 or NDimP == 2, "wrong!");
+        if constexpr(NDimP == 1)
+        {
+            return array<index_t, 1>{get_lane_id()};
+        }
+        else if constexpr(NDimP == 2)
+        {
+            return array<index_t, 2>{get_warp_id(), get_lane_id()};
+        }
+    }
    CK_TILE_HOST_DEVICE static constexpr auto get_lengths()
    {
 #if 0
@@ -149,6 +172,16 @@ struct tile_distribution
    }
 #endif
+    template <typename PartitionIndex = decltype(_get_partition_index())>
+    CK_TILE_HOST_DEVICE auto
+    calculate_index(const PartitionIndex& ps_idx = _get_partition_index()) const // defaults to the caller's own partition index
+    {
+        const auto ps_ys_idx = container_concat(ps_idx, array<index_t, NDimY>{0}); // P index followed by an NDimY-element array initialised with {0}
+        const auto window_adaptor_thread_coord_tmp =
+            make_tensor_adaptor_coordinate(ps_ys_to_xs_, ps_ys_idx);
+        return window_adaptor_thread_coord_tmp.get_bottom_index(); // bottom index of the coordinate built through ps_ys_to_xs_
+    }
    CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans()
    {
        constexpr auto distributed_spans_impl = DstrEncode::detail::distributed_spans_lengthss_;
@@ -421,6 +454,7 @@ struct tile_distribution_detail
 } // namespace detail
+#if 0
 // this returns a constexpr tile_distribution
 template <typename StaticTileDistributionEncoding_>
 CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_)
@@ -457,6 +491,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistribution
        detail::tile_distribution_detail<remove_cvref_t<decltype(rh_major_minor_to_hidden_ids)>>>{
        ps_ys_to_xs_adaptor, ys_to_d_descriptor};
 }
+#endif
 // this returns a static tile_distribution
 template <typename StaticTileDistributionEncoding_>
@@ -499,129 +534,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistr
 //***********************************************************************************
 namespace detail {
-template <typename Distribution>
-CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
-{
-    // only support warp-tile and block-tile
-    static_assert(Distribution::NDimP == 1 or Distribution::NDimP == 2, "wrong!");
-    if constexpr(Distribution::NDimP == 1)
-    {
-        return array<index_t, 1>{get_lane_id()};
-    }
-    else if constexpr(Distribution::NDimP == 2)
-    {
-        return array<index_t, 2>{get_warp_id(), get_lane_id()};
-    }
-}
-template <typename, typename, typename, index_t>
-struct reverse_slice_sequence_impl;
-template <index_t x,
-          index_t... xs,
-          index_t m,
-          index_t... ms,
-          index_t id,
-          index_t... ids,
-          index_t SliceSize>
-struct reverse_slice_sequence_impl<sequence<x, xs...>,
-                                   sequence<m, ms...>,
-                                   sequence<id, ids...>,
-                                   SliceSize>
-{
-    using old_scan =
-        reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
-    static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
-    static constexpr auto slice_length =
-        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
-    using dim_lengths =
-        typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
-    using dim_slices =
-        typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
-    using remaining_slice_sizes = typename sequence_merge<
-        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
-        typename old_scan::remaining_slice_sizes>::type;
-    // the first idx that sliced length not equal to original length
-    static constexpr index_t _flag =
-        slice_length != x && remaining_slice_sizes{}.front().value == 1;
-    static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
-    static constexpr index_t _split_idx =
-        std::conditional_t<_split_flag, number<id>, number<0>>::value;
-    static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
-    static constexpr index_t split_idx  = std::
-        conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
-};
-template <index_t x, index_t m, index_t id, index_t SliceSize>
-struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
-{
-    static constexpr auto slice_size = SliceSize;
-    static constexpr auto slice_length =
-        std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
-    using dim_lengths = sequence<slice_length>;
-    using dim_slices  = sequence<x / slice_length>;
-    using remaining_slice_sizes =
-        std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
-    // the first idx that sliced length not equal to original length
-    static constexpr index_t _flag =
-        slice_length != x && remaining_slice_sizes{}.front().value == 1;
-    static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
-    static constexpr index_t split_idx =
-        std::conditional_t<split_flag, number<id>, number<0>>::value;
-};
-// clang-format off
-// input a sequence(with optional mask), and the SliceSize : size per slice
-// output the sequence each slice, and number of slices
-//
-// e.g. <2, 1, 4, 2>, 8     -> lengths:<1, 1, 4, 2>    , nums: <2, 1, 1, 1>    : 2 slices  , slice_idx: 0
-//      <4, 2, 4, 1, 2>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
-//      <4, 2, 4, 1, 6>, 4  -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
-//      <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices  , slice_idx: 1
-//
-//      <4, 2, 8>, 64       -> lengths:<4, 2, 8>       , nums: <1, 1, 1>       : 1  slices , slice_idx: 0
-//      <4, 2, 8>, 32       -> lengths:<2, 2, 8>       , nums: <2, 1, 1>       : 2  slices , slice_idx: 0
-//      <4, 2, 8>, 16       -> lengths:<1, 2, 8>       , nums: <4, 1, 1>       : 4  slices , slice_idx: 0
-//      <4, 2, 8>, 8        -> lengths:<1, 1, 8>       , nums: <4, 2, 1>       : 8  slices , slice_idx: 1
-//      <4, 2, 8>, 4        -> lengths:<1, 1, 4>       , nums: <4, 2, 2>       : 16 slices , slice_idx: 2
-//      <4, 2, 8>, 2        -> lengths:<1, 1, 2>       , nums: <4, 2, 4>       : 32 slices , slice_idx: 2
-//      <4, 2, 8>, 1        -> lengths:<1, 1, 1>       , nums: <4, 2, 8>       : 64 slices , slice_idx: 2
-//
-//      <4, 2, 1, 4, 2> / 4 ->
-// mask:<1, 1, 1, 0, 1>,    -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices  , slice_idx: 0
-//
-// return tuple<slice_lengths, slice_nums, slice_index>, slice_index is at which index will start
-// have split slices (right -> left)
-//  or the first index that sliced length is different from the original length
-// clang-format on
-template <typename Seq,
-          index_t SliceSize,
-          typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
-constexpr auto reverse_slice_sequence(Seq,
-                                      number<SliceSize>,
-                                      Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
-{
-    static_assert(Seq::size() == Mask::size());
-    using sliced_type =
-        reverse_slice_sequence_impl<Seq,
-                                    Mask,
-                                    typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
-                                    SliceSize>;
-    static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
-                  "can not evenly divide this sequence, please check");
-    return make_tuple(typename sliced_type::dim_lengths{},
-                      typename sliced_type::dim_slices{},
-                      number<sliced_type::split_idx>{});
-}
 //
 // slice tensor from x_dim, result in split in y_dim, not p_dim.
 // We don't support slice cross p_dim (aka, slice different threads)

--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
@@ -18,6 +18,8 @@
 namespace ck_tile {
+// Note: this tile window does not support single issue;
+// use the tile_window_linear structure for that purpose
 template <typename BottomTensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
@@ -41,6 +43,7 @@ struct tile_window_with_static_distribution
    static constexpr auto I0 = number<0>{};
    static constexpr auto I1 = number<1>{};
+    static_assert(NumCoord == 1);
    // TODO: check WindowLengths and StaticTileDistribution are consistent
@@ -189,7 +192,8 @@ struct tile_window_with_static_distribution
            constexpr auto idx_diff_ys =
                SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
-            constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+            constexpr auto idx_diff_ps_ys = container_concat(
+                generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
            move_window_adaptor_and_bottom_tensor_thread_coordinate(
                window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -222,10 +226,11 @@ struct tile_window_with_static_distribution
    // move thread's window adaptor coordinate and bottom tensor coordinate
    // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
+    template <typename ATopIndex>
    CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
        WindowAdaptorCoord& window_adaptor_thread_coord,
        BottomTensorCoord& bottom_tensor_thread_coord,
-        const AdaptorTopIndex& idx_diff_adaptor_top) const
+        const ATopIndex& idx_diff_adaptor_top) const
    {
        array<index_t, NDimBottomTensor> idx_diff_adaptor_bottom;
@@ -279,20 +284,28 @@ struct tile_window_with_static_distribution
                          get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims));
    }
-    CK_TILE_DEVICE constexpr auto get_num_access() const { return load_store_traits::NumAccess; }
+    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return load_store_traits::NumAccess; }
-    template <bool oob_conditional_check = true>
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true> // Value-returning load: creates a distributed tensor, fills it, and returns it.
-    CK_TILE_DEVICE auto load(bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE auto load(number<i_access_unsupport_>          = {}, // unnamed parameter; not used in this body
+                             bool_constant<oob_conditional_check> = {}) const
    {
-        using Traits = load_store_traits;
+        constexpr auto tile_dstr = TileDstr{};
+        auto dst_tensor          = make_static_distributed_tensor<DataType>(tile_dstr); // destination tensor created for this window's distribution
+        load(dst_tensor, bool_constant<oob_conditional_check>{}); // delegate the actual reads to the overload that fills an existing tensor
+        return dst_tensor;
+    }
+    template <typename DistributedTensor, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
+                             bool_constant<oob_conditional_check> = {}) const
+    {
+        using Traits   = load_store_traits;
        using vector_t = typename Traits::vector_t;
        using SFC_Ys   = typename Traits::SFC_Ys;
        constexpr auto tile_dstr = TileDstr{};
-        auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
        // loop over thread tensor space [y0, y1, ...]
        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
            /// TODO: use structure binding (to be captured later) if compiled in C++20
@@ -308,11 +321,11 @@ struct tile_window_with_static_distribution
                // read from bottom tensor
                const vector_t vec_value =
                    get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
-                        bottom_tensor_thread_coord, bool_constant<oob_conditional_check>{});
+                        bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
 #if 1
                // write into distributed tensor
                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                            : idx_ys_start[jj];
@@ -338,20 +351,23 @@ struct tile_window_with_static_distribution
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
                }
            });
        });
-        return dst_tensor;
    }
-    template <typename DstTile, bool oob_conditional_check = true, bool pre_nop = false>
+    template <typename DstTile,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true,
+              bool pre_nop                = false>
    CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
+                                 number<i_access_unsupport_>          = {},
                                 bool_constant<oob_conditional_check> = {},
                                 bool_constant<pre_nop>               = {}) const
    {
@@ -397,6 +413,7 @@ struct tile_window_with_static_distribution
                get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
                    dst_vec_tbuf.template at<d / Traits::ScalarPerVector>(),
                    bottom_tensor_thread_coord,
+                    0 /**/,
                    bool_constant<oob_conditional_check>{},
                    pre_nop_);
 #if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
@@ -409,23 +426,24 @@ struct tile_window_with_static_distribution
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
                }
            });
        });
-#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
-        asm volatile("; this inline asm is workaround to prevent compiler from using too much "
-                     "scratch memory" ::);
-#endif
    }
    // TODO: currently async load only implemented in inline asm
-    template <typename LdsTileWindow_, bool oob_conditional_check = true, bool pre_nop = false>
+    template <typename LdsTileWindow_,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true,
+              bool pre_nop                = false>
    CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
+                                       number<i_access_unsupport_>          = {},
                                       bool_constant<oob_conditional_check> = {},
                                       bool_constant<pre_nop>               = {}) const
    {
@@ -467,7 +485,7 @@ struct tile_window_with_static_distribution
        // loop over thread tensor space [y0, y1, ...]
        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
-            // TODO: use structure binding (to be captured later) if compiled in C++20
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
@@ -482,15 +500,16 @@ struct tile_window_with_static_distribution
                // read from bottom tensor
                get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
-                    smem, bottom_tensor_thread_coord, pre_nop_);
+                    smem, bottom_tensor_thread_coord, 0, pre_nop_);
                // move thread coordinate
                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -501,8 +520,81 @@ struct tile_window_with_static_distribution
        });
    }
-    template <bool oob_conditional_check = true>
+    template <typename LdsTileWindow_, // Issues async vectorized reads from the bottom tensor view into the buffer behind `lds_tile`.
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true>
+    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
+                                   number<i_access_unsupport_>          = {}, // unnamed parameter; not used in this body
+                                   bool_constant<oob_conditional_check> = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+        // the LDS window must be 3-D; its dimensions are treated as issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+        // TODO: an LDS offset is not good for an intrinsic-based implementation (the compiler can't
+        // figure out the dependency), hence avoid an offset-based solution. size_per_buf should be
+        // zero (how to check?)
+        constexpr index_t size_per_buf = // descriptor offset of element (0, 0, 0)
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<0>{}, number<0>{}));
+        constexpr index_t size_per_wave = // offset step for +1 along dimension 1, relative to size_per_buf
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<1>{}, number<0>{})) -
+            size_per_buf;
+        constexpr index_t size_per_issue = // offset step for +1 along dimension 0, relative to size_per_buf
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<1>{}, number<0>{}, number<0>{})) -
+            size_per_buf;
+        const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(); // starting element offset: base plus get_warp_id() wave steps
+        using Traits = load_store_traits;
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
+        // TODO: we force CK_TILE_LDS_ADDR
+        CK_TILE_LDS_ADDR LdsDataType* smem =
+            lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value; // destination pointer, advanced per issue below
+        // loop over thread tensor space [y0, y1, ...]
+        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+            /// TODO: use structured bindings (to be captured later) if compiled in C++20
+            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; // local copies: the stored coordinates are not modified
+            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
+            static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
+                constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{}; // flat access index across all coords
+                // read from bottom tensor into smem (NOTE(review): the literal 0 is presumably the linear offset - confirm)
+                get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
+                    smem, bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
+                // move thread coordinate, except after the last access of this coord
+                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
+                {
+                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
+                    constexpr auto idx_diff_ps_ys = container_concat( // P part of the step is all number<0>: only the Y part moves
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
+                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+                    smem += size_per_issue; // Note: the per-issue offset is increased manually, and only when another access follows
+                }
+            });
+        });
+    }
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                              number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
    {
        using Traits = load_store_traits;
@@ -515,7 +607,6 @@ struct tile_window_with_static_distribution
        // loop over thread tensor space [y0, y1, ...]
        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
-            /// TODO: use structure binding (to be captured later) if compiled in C++20
            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
@@ -530,7 +621,7 @@ struct tile_window_with_static_distribution
                vector_t vec_value;
                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                            : idx_ys_start[jj];
@@ -548,15 +639,19 @@ struct tile_window_with_static_distribution
                // write into bottom tensor
                get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
-                    bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
+                    bottom_tensor_thread_coord,
+                    0,
+                    vec_value,
+                    bool_constant<oob_conditional_check>{});
                // move thread coordinate
                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -565,8 +660,9 @@ struct tile_window_with_static_distribution
        });
    }
-    CK_TILE_DEVICE void
+    template <index_t i_access_unsupport_ = -1>
-    store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor) const
+    CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                  number<i_access_unsupport_> = {}) const
    {
        using Traits = load_store_traits;
@@ -591,7 +687,7 @@ struct tile_window_with_static_distribution
                // read from distributed tensor
                vector_t vec_value;
                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                            : idx_ys_start[jj];
@@ -606,15 +702,16 @@ struct tile_window_with_static_distribution
                // write into bottom tensor
                get_bottom_tensor_view()
                    .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
-                        bottom_tensor_thread_coord, vec_value);
+                        bottom_tensor_thread_coord, 0, vec_value);
                // move thread coordinate
                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -623,8 +720,9 @@ struct tile_window_with_static_distribution
        });
    }
-    template <bool oob_conditional_check = true>
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
    CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                               number<i_access_unsupport_>          = {},
                               bool_constant<oob_conditional_check> = {}) const
    {
        using Traits = load_store_traits;
@@ -650,7 +748,7 @@ struct tile_window_with_static_distribution
                vector_t vec_value;
                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
-                    constexpr auto idx_ys = generate_array(
+                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
                                                            : idx_ys_start[jj];
@@ -666,15 +764,19 @@ struct tile_window_with_static_distribution
                // write into bottom tensor
                get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
-                    bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
+                    bottom_tensor_thread_coord,
+                    0,
+                    vec_value,
+                    bool_constant<oob_conditional_check>{});
                // move thread coordinate
                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
                {
                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
-                    constexpr auto idx_diff_ps_ys =
+                    constexpr auto idx_diff_ps_ys = container_concat(
-                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+                        generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                        idx_diff_ys);
                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -746,7 +848,8 @@ struct tile_window_with_static_distribution
            constexpr auto idx_diff_ys =
                SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
-            constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+            constexpr auto idx_diff_ps_ys = container_concat(
+                generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
            move_window_adaptor_and_bottom_tensor_thread_coordinate(
                window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
@@ -798,6 +901,27 @@ make_tile_window(const TensorView_& tensor_view,
        tensor_view, window_lengths, origin, tile_distribution};
 }
+// Builds a tile_window_with_static_distribution and calls init_raw() on it before returning it; this version can't be called in a constexpr context
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          index_t NumCoord = 1>
+CK_TILE_DEVICE auto
+make_tile_window_raw(const TensorView_& tensor_view,
+                     const WindowLengths_& window_lengths,
+                     const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                     const StaticTileDistribution_& tile_distribution,
+                     number<NumCoord> = {}) // unnamed tag argument: only carries NumCoord for template deduction
+{
+    auto w = tile_window_with_static_distribution<remove_cvref_t<TensorView_>, // cv/ref qualifiers are stripped from the deduced types
+                                                  remove_cvref_t<WindowLengths_>,
+                                                  remove_cvref_t<StaticTileDistribution_>,
+                                                  NumCoord>{
+        tensor_view, window_lengths, origin, tile_distribution};
+    w.init_raw(); // extra step on the constructed window before it is returned
+    return w;
+}
 template <typename TensorView_,
          typename WindowLengths_,
          typename StaticTileDistribution_,
@@ -922,6 +1046,19 @@ make_tile_window(const tile_window_with_static_lengths<TensorView, WindowLengths
                            tile_distribution);
 }
+// Re-creates a window with a tile distribution from a static-lengths window (same bottom
+// tensor view, window lengths and origin), calls init_raw() on it and returns it by value.
+template <typename TensorView, typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_raw(const tile_window_with_static_lengths<TensorView, WindowLengths>& tile_window,
+                     const StaticTileDistribution& tile_distribution)
+{
+    auto distributed_window = make_tile_window(tile_window.get_bottom_tensor_view(),
+                                               tile_window.get_window_lengths(),
+                                               tile_window.get_window_origin(),
+                                               tile_distribution);
+    distributed_window.init_raw();
+    return distributed_window;
+}
 template <typename TensorView_, typename WindowLengths_>
 CK_TILE_DEVICE void move_tile_window(
    tile_window_with_static_lengths<TensorView_, WindowLengths_>& window,

--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/arch/utility.hpp"
+#include "ck_tile/core/algorithm/space_filling_curve.hpp"
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/container/array.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
+#include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
+#include "ck_tile/core/utility/functional.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+namespace ck_tile {
+// Expands inside a function that has a callable `issue`, a compile-time `i_access` and
+// `NumAccess` in scope: a negative `i_access` calls issue() for every access in
+// [0, NumAccess), otherwise only for `i_access` (checked against NumAccess at compile time).
+#define WINDOW_DISPATCH_ISSUE()                                           \
+    if constexpr(i_access < 0)                                            \
+    {                                                                     \
+        static_for<0, NumAccess, 1>{}([&](auto i_acc) { issue(i_acc); }); \
+    }                                                                     \
+    else                                                                  \
+    {                                                                     \
+        static_assert(i_access < NumAccess);                              \
+        issue(number<i_access>{});                                        \
+    }
+//
+// This version of tile window will pre-cache offset/flags based on need
+//
+// LinearBottomDims_, e.g. seq<0, 1> for a 2d tensor, where the last one is the linear dim,
+// so the last dim can use an immediate offset for indexing, which can save registers
+// TODO: if using this struct, better use load_raw()/store_raw(), which can control
+//       the immediate offset on the fly
+// space-filling-curve is non-snaked here!
+//
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_>
+struct tile_window_linear
+{
+    using BottomTensorView = remove_reference_t<BottomTensorView_>;
+    using WindowLengths    = remove_cvref_t<WindowLengths_>;
+    using TileDstr         = remove_cvref_t<StaticTileDistribution_>;
+    using WindowAdaptor    = typename TileDstr::PsYs2XsAdaptor;
+    using BottomTensorDesc = typename BottomTensorView::TensorDesc;
+    using DataType         = remove_cvref_t<typename BottomTensorView::DataType>;
+    using LinearBottomDims = remove_cvref_t<LinearBottomDims_>;
+    static_assert(LinearBottomDims::size() == BottomTensorView::get_num_of_dimension());
+    static constexpr index_t NDimWindowAdaptorTop = WindowAdaptor::get_num_of_top_dimension();
+    static constexpr index_t NDimBottomTensor     = BottomTensorDesc::get_num_of_dimension();
+    static constexpr index_t NDimP = TileDstr::get_num_of_dimension_p();
+    static constexpr index_t NDimY = TileDstr::get_num_of_dimension_y();
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+    // TODO: check WindowLengths and StaticTileDistribution are consistent
+    static_assert(ck_tile::is_known_at_compile_time<WindowLengths>::value,
+                  "wrong! lengths should be static");
+    static_assert(TileDstr::is_static(), "wrong!");
+    static_assert(NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(),
+                  "wrong! inconsistent # of diemsnions");
+    using AdaptorTopIndex   = array<index_t, NDimWindowAdaptorTop>;
+    using BottomTensorIndex = array<index_t, NDimBottomTensor>;
+    using WindowAdaptorCoord =
+        decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{}));
+    using BottomTensorCoord =
+        decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{}));
+    struct traits
+    {
+        private:
+        // Returns a tuple (lengths, strides) with the safe vector length and stride of every
+        // Y dimension [y0, y1, ...] of the window adaptor. The values start from what the
+        // bottom tensor descriptor reports and are propagated up through the window adaptor.
+        CK_TILE_DEVICE static constexpr auto get_window_adaptor_ys_safe_vector_length_strides()
+        {
+            using HiddenDimArray = array<index_t, WindowAdaptor::get_num_of_hidden_dimension()>;
+            // bottom tensor top dimension vector lengths and strides; they are used as-is for
+            // the window adaptor's bottom dimensions
+            const auto [bottom_vector_lengths, bottom_vector_strides] =
+                BottomTensorDesc::get_top_dimension_safe_vector_length_strides();
+            // one slot per hidden dimension of the window adaptor, constructed from {-1}
+            HiddenDimArray hidden_vector_lengths{-1};
+            HiddenDimArray hidden_vector_strides{-1};
+            // write the bottom values into the slots of the adaptor's bottom dimensions
+            constexpr auto bottom_hidden_ids = WindowAdaptor::get_bottom_dimension_hidden_ids();
+            set_container_subset(hidden_vector_lengths, bottom_hidden_ids, bottom_vector_lengths);
+            set_container_subset(hidden_vector_strides, bottom_hidden_ids, bottom_vector_strides);
+            // window adaptor top dimensions [p0, p1, ..., y0, y1, ...]
+            const auto [ps_ys_vector_lengths, ps_ys_vector_strides] =
+                WindowAdaptor{}.get_top_dimension_safe_vector_length_strides(
+                    hidden_vector_lengths, hidden_vector_strides);
+            // keep only [y0, y1, ...]
+            constexpr auto y_dim_ids =
+                typename arithmetic_sequence_gen<TileDstr::get_num_of_dimension_p(),
+                                                 NDimWindowAdaptorTop,
+                                                 1>::type{};
+            return make_tuple(get_container_subset(ps_ys_vector_lengths, y_dim_ids),
+                              get_container_subset(ps_ys_vector_strides, y_dim_ids));
+        }
+        // Picks the Y dimension to vectorize along: among the Y dims whose safe vector stride
+        // is 1, the one with the largest safe vector length (the first such dim wins a tie).
+        // Returns a tuple (dim index, scalars per vector); it is (0, 1) when no dim has stride 1
+        // and a length above 1.
+        static constexpr auto get_vector_dim_y_scalar_per_vector()
+        {
+            const auto [ys_vector_lengths, ys_vector_strides] =
+                get_window_adaptor_ys_safe_vector_length_strides();
+            index_t best_dim    = 0;
+            index_t best_length = 1;
+            for(index_t i_dim = 0; i_dim < NDimY; ++i_dim)
+            {
+                // only dims with stride 1 are candidates
+                if(ys_vector_strides[i_dim] != 1)
+                    continue;
+                if(ys_vector_lengths[i_dim] > best_length)
+                {
+                    best_length = ys_vector_lengths[i_dim];
+                    best_dim    = i_dim;
+                }
+            }
+            return make_tuple(best_dim, best_length);
+        }
+        public:
+        static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
+        static constexpr index_t ScalarPerVector =
+            get_vector_dim_y_scalar_per_vector().template at<1>();
+        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+        private:
+        static constexpr auto scalars_per_access_ = [] {
+            constexpr auto scalars_per_access_arr = generate_array(
+                [&](auto i) { return (i == VectorDimY) ? ScalarPerVector : 1; }, number<NDimY>{});
+            /// TODO: add non-automatic storage argument support to macro TO_SEQUENCE()
+            constexpr auto NDimY_ = NDimY;
+            return TO_SEQUENCE(scalars_per_access_arr, NDimY_);
+        }();
+        // Returns a default-constructed space_filling_curve over the per-thread Y lengths,
+        // visiting dims in their natural order, scalars_per_access_ scalars per access, with
+        // snaking disabled.
+        static constexpr auto get_space_filling_curve()
+        {
+            constexpr auto thread_tensor_lengths_ys =
+                to_sequence(TileDstr{}.get_ys_to_d_descriptor().get_lengths());
+            using ThreadLengthsYs  = decltype(thread_tensor_lengths_ys);
+            using ScalarsPerAccess = decltype(scalars_per_access_);
+            // FIXME: need logic to judge dim access order
+            using DimAccessOrder = typename arithmetic_sequence_gen<0, NDimY, 1>::type;
+            using Curve          = space_filling_curve<ThreadLengthsYs,
+                                              DimAccessOrder,
+                                              ScalarsPerAccess,
+                                              false /*!!! no snaked curve! */>;
+            return Curve{};
+        }
+        public:
+        using SFC_Ys = decltype(get_space_filling_curve());
+        static constexpr index_t NumAccess = SFC_Ys::get_num_of_access();
+        static_assert(0 < NumAccess, "Wrong! NumAccess should be larger than 0");
+        private:
+        // Returns the product of the space-filling-curve access lengths over every Y dim whose
+        // bottom dim is flagged 0 (non-linear) in LinearBottomDims.
+        static constexpr auto get_num_non_linear_access()
+        {
+            constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
+            using ys_to_rhs_major =
+                typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+            constexpr auto num_non_linear = [&]() {
+                index_t product = 1;
+                static_for<0, NDimY, 1>{}([&](auto y_dim) {
+                    constexpr auto rhs_major = ys_to_rhs_major{}[y_dim];
+                    // rhs_major - 1 indexes LinearBottomDims (no r dim here!)
+                    constexpr auto h_dim = number<rhs_major - 1>{};
+                    if constexpr(LinearBottomDims{}[h_dim] == 0)
+                    {
+                        product *= sfc_access_lens[y_dim];
+                    }
+                });
+                return product;
+            }();
+            return num_non_linear;
+        }
+        // example:
+        // non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 accesses, 2 registers
+        // used in total
+        //  -> histogram : sequence<4, 4>
+        //  -> prefixsum : sequence<0, 4, 8>
+        // non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 accesses, 8 registers
+        // used in total, will pre-cache 8
+        //  -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1>
+        //  -> prefixsum : sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>
+        // non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 accesses, 4 registers
+        // used in total, will pre-cache 4
+        //  -> histogram : sequence<2, 2, 2, 2>
+        //  -> prefixsum : sequence<0, 2, 4, 6, 8>
+        // Builds the access -> non-linear id map (one entry per access, returned as a sequence).
+        // Y dims are visited from the last to the first. Each visited dim replicates the map
+        // built so far once per access along that dim; a non-linear dim additionally shifts the
+        // ids of every replica, while a linear dim reuses the ids unchanged.
+        static constexpr auto get_non_linear_access_map()
+        {
+            constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
+            using ys_to_rhs_major =
+                typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+            constexpr auto non_linear_map = [&]() {
+                array<index_t, NumAccess> access_map{0};
+                index_t num_filled     = 1; // leading entries of access_map built so far
+                index_t num_ids_so_far = 1; // product of the non-linear dim lengths seen so far
+                static_for<0, NDimY, 1>{}([&](auto i_rev) {
+                    constexpr auto y_dim     = number<NDimY - i_rev - 1>{}; // from right to left
+                    constexpr auto rhs_major = ys_to_rhs_major{}[y_dim];
+                    constexpr auto h_dim     = number<rhs_major - 1>{}; // no r dim here!
+                    constexpr auto is_linear_dim = LinearBottomDims{}[h_dim];
+                    constexpr auto dim_len       = sfc_access_lens[y_dim];
+                    // snapshot of the pattern built so far
+                    array<index_t, NumAccess> pattern{0};
+                    for(auto i_src = 0; i_src < num_filled; i_src++)
+                    {
+                        pattern(i_src) = access_map[i_src];
+                    }
+                    // replicate the snapshot dim_len times, shifting ids for a non-linear dim
+                    for(auto i_rep = 0; i_rep < dim_len; i_rep++)
+                    {
+                        auto id_shift = is_linear_dim ? 0 : i_rep * num_ids_so_far;
+                        for(auto i_src = 0; i_src < num_filled; i_src++)
+                        {
+                            access_map(i_rep * num_filled + i_src) = pattern[i_src] + id_shift;
+                        }
+                    }
+                    num_filled *= dim_len;
+                    if(!is_linear_dim)
+                        num_ids_so_far *= dim_len;
+                });
+                return access_map;
+            }();
+            return TO_SEQUENCE(non_linear_map, NumAccess);
+        }
+        // Returns the histogram of the access map as a sequence, computed by
+        // histogram_sorted_sequence() (e.g. map <0, 0, 1, 1> gives <2, 2>, following the
+        // examples documented for the access map).
+        static constexpr auto get_non_linear_access_histogram()
+        {
+            constexpr auto access_map = get_non_linear_access_map();
+            // range sequence starting at 0 with end bound get_num_non_linear_access() + 1
+            constexpr auto id_range =
+                typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};
+            constexpr auto histogram = histogram_sorted_sequence(access_map, id_range);
+            return histogram;
+        }
+        // Returns prefix_sum_sequence() of the access histogram (e.g. histogram <2, 2> gives
+        // <0, 2, 4>, following the examples documented for the access map).
+        static constexpr auto get_non_linear_access_histogram_prefix_sum()
+        {
+            constexpr auto histogram  = get_non_linear_access_histogram();
+            constexpr auto prefix_sum = prefix_sum_sequence(histogram);
+            return prefix_sum;
+        }
+        public:
+        static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access();
+        using AccessMap_NonLinear       = decltype(get_non_linear_access_map()); // sequence
+        using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram());
+        using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum());
+    };
+    static constexpr index_t NumAccess           = traits::NumAccess;
+    static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear;
+    using AccessMap_NonLinear                    = typename traits::AccessMap_NonLinear;
+    using AccessHistogram_NonLinear              = typename traits::AccessHistogram_NonLinear;
+    using AccessPrefixSum_NonLinear              = typename traits::AccessPrefixSum_NonLinear;
+    CK_TILE_DEVICE constexpr tile_window_linear() = default;
+    // Stores the view, lengths, origin and distribution, then walks every access of the
+    // space-filling curve once for the calling thread and fills:
+    //   cached_flags_  - one validity flag per access
+    //   cached_coords_ - one bottom tensor coordinate per non-linear id, taken at the first
+    //                    access that maps to that id
+    CK_TILE_DEVICE constexpr tile_window_linear(const BottomTensorView& bottom_tensor_view,
+                                                const WindowLengths& window_lengths,
+                                                const BottomTensorIndex& window_origin,
+                                                const TileDstr& tile_distribution)
+        : bottom_tensor_view_{bottom_tensor_view},
+          window_lengths_{window_lengths},
+          window_origin_{window_origin},
+          tile_dstr_{tile_distribution},
+          cached_coords_{},
+          cached_flags_{}
+    {
+        using SFC_Ys = typename traits::SFC_Ys;
+        // adaptor coordinate of this thread at [warp id, lane id, 0, 0, ...]
+        auto adaptor_coord = make_tensor_adaptor_coordinate(
+            tile_distribution.get_ps_ys_to_xs_adaptor(),
+            container_concat(make_tuple(get_warp_id(), get_lane_id()),
+                             generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
+        // matching bottom tensor coordinate, offset by the window origin
+        BottomTensorIndex thread_origin_idx = window_origin + adaptor_coord.get_bottom_index();
+        auto bottom_coord = make_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
+                                                   thread_origin_idx);
+        // pre-compute what future load/store() calls need (might allocate more registers)
+        static_for<0, NumAccess, 1>{}([&](auto i_acc) {
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_acc]>{};
+            // true only for the first access of each non-linear id
+            constexpr auto is_first_of_id =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_acc>{};
+            if constexpr(is_first_of_id)
+            {
+                cached_coords_(non_linear_id) = bottom_coord;
+            }
+            // TODO: need pad_tensor_view to check which dim need use flag to check
+            //      cached flag is independent from non-linear-coord
+            //      but need be updated in move_tile, with proper dims
+            cached_flags_(i_acc) = coordinate_has_valid_offset_assuming_top_index_is_valid(
+                bottom_tensor_view_.get_tensor_descriptor(), bottom_coord);
+            // step both coordinates to the next access, except after the last one
+            if constexpr(i_acc != (NumAccess - 1))
+            {
+                constexpr auto step_ys = SFC_Ys::get_forward_step(i_acc); // tuple of number
+                constexpr auto step_ps_ys = container_concat(
+                    generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), step_ys);
+                move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                    adaptor_coord, bottom_coord, step_ps_ys);
+            }
+        });
+    }
+    CK_TILE_DEVICE static constexpr index_t get_num_of_dimension() { return NDimBottomTensor; } // bottom tensor dims
+    CK_TILE_DEVICE static constexpr bool has_static_tile_distribution()
+    {
+        return TileDstr::is_static(); // forwards TileDstr::is_static()
+    }
+    CK_TILE_DEVICE constexpr auto get_window_lengths() const { return window_lengths_; } // copy
+    CK_TILE_DEVICE constexpr auto get_tile_distribution() const { return tile_dstr_; } // copy
+    CK_TILE_DEVICE constexpr auto get_bottom_tensor_view() const { return bottom_tensor_view_; } // copy
+    CK_TILE_DEVICE constexpr auto get_window_origin() const { return window_origin_; } // copy
+    CK_TILE_DEVICE constexpr void
+    set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType* data)
+    {
+        bottom_tensor_view_.buf_.p_data_ = data; // only the pointer changes; cached_coords_/cached_flags_ are not touched
+    }
+    // Moves the thread's window adaptor coordinate and its bottom tensor coordinate by one step.
+    // The step is given in the adaptor's top index space; the bottom-index step passed out of
+    // move_tensor_adaptor_coordinate() is then applied to the bottom tensor coordinate:
+    // [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
+    template <typename ATopIndex>
+    CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
+        WindowAdaptorCoord& window_adaptor_thread_coord,
+        BottomTensorCoord& bottom_tensor_thread_coord,
+        const ATopIndex& idx_diff_adaptor_top) const
+    {
+        // left uninitialized here; presumably written by move_tensor_adaptor_coordinate()
+        BottomTensorIndex bottom_step;
+        move_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(),
+                                       window_adaptor_thread_coord,
+                                       idx_diff_adaptor_top,
+                                       bottom_step);
+        move_tensor_coordinate(
+            bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord, bottom_step);
+    }
+    // Returns the adaptor bottom index [x0, x1, ...] of access `i_access`, computed with every
+    // P index and every Y index that maps to a non-linear bottom dim set to 0, so only the Y
+    // indices of the linear dims contribute.
+    template <index_t i_access>
+    CK_TILE_DEVICE static constexpr auto get_bottom_linear_coordinate(number<i_access>)
+    {
+        using SFC_Ys          = typename traits::SFC_Ys;
+        constexpr auto idx_ys = SFC_Ys::get_index(number<i_access>{});
+        using ys_to_rhs_major =
+            typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
+        // keep the Y index of linear dims, zero out the others
+        constexpr auto modified_idx_ys = generate_tuple(
+            [&](auto i_dim_y) {
+                constexpr auto rhs_major    = ys_to_rhs_major{}[i_dim_y];
+                constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
+                if constexpr(LinearBottomDims{}[target_h_dim] == 0)
+                {
+                    return number<0>{};
+                }
+                else
+                {
+                    return number<idx_ys[i_dim_y]>{};
+                }
+            },
+            number<NDimY>{});
+        constexpr auto adaptor_ = TileDstr{}.get_ps_ys_to_xs_adaptor();
+        // one zero per P dim (NDimP of them) rather than a hard-coded pair, the same way the
+        // constructor pads its Y steps; identical to the previous tuple when NDimP == 2
+        constexpr auto idx_ = container_concat(
+            generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), modified_idx_ys);
+        return adaptor_.calculate_bottom_index(idx_);
+    }
+    // Returns the compile-time offset contributed by the linear dims of access `i_access`:
+    // the linear bottom coordinate is flattened using TileDstr{}.get_lengths() as the extents,
+    // with the last dim varying fastest.
+    // NOTE(review): the extents are the tile lengths, not the bottom tensor's strides - confirm
+    // this is intended when more than the last dim is linear.
+    template <index_t i_access>
+    CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
+    {
+        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
+        // since this is a linear offset, we assume the bottom X tensor is always linear
+        constexpr index_t linear_offset = [&]() {
+            constexpr auto x_idx = linear_coord;
+            constexpr auto x_len = TileDstr{}.get_lengths();
+            static_assert(x_idx.size() == x_len.size());
+            constexpr index_t num_x_dims = x_idx.size();
+            index_t running_stride       = 1;
+            index_t offset               = 0;
+            // accumulate from the last dim to the first
+            static_for<0, num_x_dims, 1>{}([&](auto i_rev) {
+                auto x_dim = number<num_x_dims - i_rev - 1>{};
+                offset += x_idx[x_dim] * running_stride;
+                running_stride *= x_len[x_dim];
+            });
+            return offset;
+        }();
+        return linear_offset;
+    }
+    CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; } // SFC_Ys access count
+    // Loads the thread's part of the window into a new static distributed tensor and returns it.
+    // i_access: the access to issue; a negative value (default) issues all NumAccess accesses.
+    // oob_conditional_check: forwarded to get_vectorized_elements().
+    // Each access reads one vector using the cached coordinate of its non-linear id, its
+    // compile-time linear offset and its cached validity flag.
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load(number<i_access> = {}, bool_constant<oob_conditional_check> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        constexpr auto tile_dstr = TileDstr{};
+        auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
+            // read from bottom tensor
+            const vector_t vec_value =
+                get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
+                    bottom_tensor_thread_coord,
+                    linear_offset,
+                    bottom_tensor_flag,
+                    bool_constant<oob_conditional_check>{});
+            // data index [y0, y1, ...] of the first scalar of this access; declared before the
+            // #if so that both branches below can use it
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+#if 1
+            // write into distributed tensor, one scalar at a time along VectorDimY
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j)
+                                                        : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                dst_tensor.get_thread_buffer().template at<d>() =
+                    vec_value.template get_as<DataType>()[j];
+            });
+#else
+            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            static_assert(d % traits::ScalarPerVector == 0);
+            dst_tensor.get_thread_buffer().template get_as<vector_t>()(
+                number<d / traits::ScalarPerVector>{}) = bit_cast<vector_t>(vec_value);
+#endif
+        };
+        WINDOW_DISPATCH_ISSUE();
+        return dst_tensor;
+    }
+    // Loads the thread's part of the window directly into the thread buffer of `dst_tensor`,
+    // one vector per access, through get_vectorized_elements_raw().
+    // i_access: the access to issue; negative means loop over all num_access.
+    // oob_conditional_check: forwarded to get_vectorized_elements_raw().
+    // pre_nop: passed on as true only for access 0 of a bottom tensor in the global address
+    //          space, false otherwise.
+    template <typename DstTile,
+              index_t i_access           = -1,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false>
+    CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
+                                 number<i_access> = {}, // negative means loop over all num_access
+                                 bool_constant<oob_conditional_check> = {},
+                                 bool_constant<pre_nop>               = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        static constexpr index_t YElementSize =
+            TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
+        static_assert(YElementSize % traits::ScalarPerVector == 0);
+        using vectorized_tbuf = array<vector_t, YElementSize / traits::ScalarPerVector>;
+        constexpr auto tile_dstr = TileDstr{};
+        // view the destination thread buffer as an array of vectors
+        auto& dst_vectors = reinterpret_cast<vectorized_tbuf&>(dst_tensor.get_thread_buffer());
+        auto issue = [&](auto i_acc) {
+            constexpr auto IAccess     = number<i_acc>{};
+            constexpr auto use_pre_nop = [&]() {
+                if constexpr(pre_nop && i_acc == 0 &&
+                             BottomTensorView::buffer_view::get_address_space() ==
+                                 address_space_enum::global)
+                    return bool_constant<true>{};
+                else
+                    return bool_constant<false>{};
+            }();
+            // cached state for this access
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto thread_coord            = cached_coords_[non_linear_id];
+            constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
+            auto is_valid                = cached_flags_[IAccess];
+            // data index [y0, y1, ...] and the matching offset in the thread buffer
+            constexpr auto ys_begin    = SFC_Ys::get_index(IAccess);
+            constexpr index_t d_offset = tile_dstr.get_ys_to_d_descriptor().calculate_offset(ys_begin);
+            static_assert(d_offset % traits::ScalarPerVector == 0);
+            get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
+                dst_vectors.template at<d_offset / traits::ScalarPerVector>(),
+                thread_coord,
+                linear_offset,
+                is_valid,
+                bool_constant<oob_conditional_check>{},
+                use_pre_nop);
+#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
+    CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
+            asm volatile(""); // this is starting from rocm-6.2, but same symptom, reuse this flag
+#endif
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // Issues async raw loads from the bottom tensor (global address space only) into the LDS
+    // tile window `lds_tile`, whose 3 dims are issues * warps * lanes. The M0 register value is
+    // set from the warp's byte offset in the LDS tile and advanced by one issue after every
+    // access but the last.
+    // i_access: the access to issue; a negative value (default) issues all NumAccess accesses.
+    // pre_nop: passed on as true only for access 0.
+    // TODO: currently async load only implemented in inline asm
+    template <typename LdsTileWindow_,
+              index_t i_access           = -1,
+              bool oob_conditional_check = true,
+              bool pre_nop               = false>
+    CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
+                                       number<i_access>                     = {},
+                                       bool_constant<oob_conditional_check> = {},
+                                       bool_constant<pre_nop>               = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+        using vector_t      = typename traits::vector_t;
+        // currently we only support everything is non linear dim
+        // actually it's not performant if we have linear dim(e.g. fast changing)
+        static_assert(NumAccess_NonLinear == NumAccess);
+        static_assert(BottomTensorView::buffer_view::get_address_space() ==
+                      address_space_enum::global);
+        // issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+        // byte offset of element (i_issue, i_warp, i_lane) of the LDS tile
+        auto lds_byte_offset = [&](auto i_issue, auto i_warp, auto i_lane) {
+            return lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                       make_tuple(i_issue, i_warp, i_lane)) *
+                   sizeof(LdsDataType);
+        };
+        const index_t size_per_buf = lds_byte_offset(number<0>{}, number<0>{}, number<0>{});
+        const index_t size_per_wave =
+            lds_byte_offset(number<0>{}, number<1>{}, number<0>{}) - size_per_buf;
+        const index_t size_per_issue =
+            lds_byte_offset(number<1>{}, number<0>{}, number<0>{}) - size_per_buf;
+        const index_t m0_start = size_per_buf + size_per_wave * get_warp_id();
+        m0_set_with_memory(m0_start); // This should be wave independent
+        LdsDataType* lds_base = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_acc) {
+            constexpr auto IAccess     = number<i_acc>{};
+            constexpr auto use_pre_nop = [&]() {
+                if constexpr(pre_nop && i_acc == 0)
+                    return bool_constant<true>{};
+                else
+                    return bool_constant<false>{};
+            }();
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto thread_coord            = cached_coords_[non_linear_id];
+            auto is_valid                = cached_flags_[IAccess]; // get this flag anyway
+            // read from bottom tensor (linear offset is always 0 here)
+            get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
+                lds_base, thread_coord, 0, is_valid, use_pre_nop);
+            // advance M0 to the next issue, except after the last access
+            if constexpr(i_acc != (NumAccess - 1))
+            {
+                m0_inc_with_memory(size_per_issue);
+            }
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // Issues async loads from the bottom tensor (global address space only) into the LDS tile
+    // window `lds_tile`, whose 3 dims are issues * warps * lanes. The destination pointer
+    // starts at the warp's element offset in the LDS tile and is advanced by one issue after
+    // every access but the last.
+    // i_access: the access to issue; a negative value (default) issues all NumAccess accesses.
+    // oob_conditional_check: forwarded to async_get_vectorized_elements().
+    template <typename LdsTileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
+                                   number<i_access>                     = {},
+                                   bool_constant<oob_conditional_check> = {}) const
+    {
+        using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
+        using LdsDataType   = typename LdsTileWindow::DataType;
+        using vector_t      = typename traits::vector_t;
+        // currently we only support everything is non linear dim
+        // actually it's not performant if we have linear dim(e.g. fast changing)
+        static_assert(NumAccess_NonLinear == NumAccess);
+        static_assert(BottomTensorView::buffer_view::get_address_space() ==
+                      address_space_enum::global);
+        // issues * warps * lanes
+        static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
+        // TODO: LDS offset is not good for intrinsic based implementation(compiler can't figure out
+        // dependency) hence avoid use offset based solution. size_per_buf should be zero (how to
+        // check?)
+        // element (not byte) offsets inside the LDS tile
+        constexpr index_t size_per_buf =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<0>{}, number<0>{}));
+        constexpr index_t size_per_wave =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<0>{}, number<1>{}, number<0>{})) -
+            size_per_buf;
+        constexpr index_t size_per_issue =
+            lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
+                make_tuple(number<1>{}, number<0>{}, number<0>{})) -
+            size_per_buf;
+        const index_t lds_start_offset = size_per_buf + size_per_wave * get_warp_id();
+        // TODO: we force CK_TILE_LDS_ADDR
+        CK_TILE_LDS_ADDR LdsDataType* lds_dst =
+            lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + lds_start_offset;
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_acc) {
+            constexpr auto IAccess       = number<i_acc>{};
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
+            auto thread_coord            = cached_coords_[non_linear_id];
+            auto is_valid                = cached_flags_[IAccess];
+            // read from bottom tensor (linear offset is always 0 here)
+            get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
+                lds_dst, thread_coord, 0, is_valid, bool_constant<oob_conditional_check>{});
+            // advance the destination to the next issue, except after the last access
+            if constexpr(i_acc != (NumAccess - 1))
+            {
+                lds_dst += size_per_issue; // Note we manually increase the per-issue offset
+            }
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // Writes this thread's part of `dstr_tensor` into the bottom tensor, one vector per access.
+    //   dstr_tensor      : distributed tensor whose thread buffer supplies the values
+    //   number<i_access> : consumed by WINDOW_DISPATCH_ISSUE, whose definition is outside this
+    //                      view (presumably selects one access, -1 meaning all; TODO confirm)
+    //   bool_constant<oob_conditional_check> : forwarded unchanged to set_vectorized_elements
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                              number<i_access>                     = {},
+                              bool_constant<oob_conditional_check> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        constexpr auto tile_dstr = TileDstr{};
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            // cached coordinate shared by all accesses mapped to the same non-linear id
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            // compile-time offset of this access, passed separately from the coordinate
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+            // read from distributed tensor
+            vector_t vec_value;
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                // same Y index as the access start, advanced by j along the vector dimension
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+                // position of that element inside the thread buffer
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+            // write into bottom tensor
+            get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
+                bottom_tensor_thread_coord,
+                linear_offset,
+                bottom_tensor_flag,
+                vec_value,
+                bool_constant<oob_conditional_check>{});
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // Same gathering as store(), but writes through set_vectorized_elements_raw and always
+    // passes oob_conditional_check = true.
+    //   dstr_tensor      : distributed tensor whose thread buffer supplies the values
+    //   number<i_access> : consumed by WINDOW_DISPATCH_ISSUE, whose definition is outside this
+    //                      view (presumably selects one access, -1 meaning all; TODO confirm)
+    template <index_t i_access = -1>
+    CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                                  number<i_access> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        constexpr auto tile_dstr                    = TileDstr{};
+        // not configurable for the raw path
+        static constexpr bool oob_conditional_check = true;
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            // cached coordinate shared by all accesses mapped to the same non-linear id
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            // compile-time offset of this access, passed separately from the coordinate
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+            // read from distributed tensor
+            vector_t vec_value;
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                // same Y index as the access start, advanced by j along the vector dimension
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+                // position of that element inside the thread buffer
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+            // write into bottom tensor
+            get_bottom_tensor_view()
+                .template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
+                    bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // Same gathering as store(), but hands each vector to update_vectorized_elements of the
+    // bottom tensor view instead of set_vectorized_elements.
+    //   dstr_tensor      : distributed tensor whose thread buffer supplies the values
+    //   number<i_access> : consumed by WINDOW_DISPATCH_ISSUE, whose definition is outside this
+    //                      view (presumably selects one access, -1 meaning all; TODO confirm)
+    //   bool_constant<oob_conditional_check> : forwarded unchanged to update_vectorized_elements
+    template <index_t i_access = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                               number<i_access>                     = {},
+                               bool_constant<oob_conditional_check> = {}) const
+    {
+        using vector_t = typename traits::vector_t;
+        using SFC_Ys   = typename traits::SFC_Ys;
+        constexpr auto tile_dstr = TileDstr{};
+        // loop over thread tensor space [y0, y1, ...]
+        auto issue = [&](auto i_access_) {
+            constexpr auto IAccess          = number<i_access_>{};
+            constexpr auto non_linear_id    = number<AccessMap_NonLinear{}[IAccess]>{};
+            // cached coordinate shared by all accesses mapped to the same non-linear id
+            auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
+            // compile-time offset of this access, passed separately from the coordinate
+            constexpr auto linear_offset    = get_bottom_linear_offset(IAccess);
+            auto bottom_tensor_flag         = cached_flags_[IAccess];
+            // data index [y0, y1, ...]
+            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
+            // read from distributed tensor
+            vector_t vec_value;
+            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+                // same Y index as the access start, advanced by j along the vector dimension
+                constexpr auto idx_ys = generate_tuple(
+                    [&](auto jj) {
+                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
+                    },
+                    number<NDimY>{});
+                // position of that element inside the thread buffer
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                vec_value.template get_as<DataType>()(j) =
+                    dstr_tensor.get_thread_buffer().template at<d>();
+            });
+            // write into bottom tensor
+            get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
+                bottom_tensor_thread_coord,
+                linear_offset,
+                bottom_tensor_flag,
+                vec_value,
+                bool_constant<oob_conditional_check>{});
+        };
+        WINDOW_DISPATCH_ISSUE();
+    }
+    // move thread's bottom tensor coordinate
+    // [x0', x1', ... ] ==> [offset]
+    // also move window-origin
+    //   step : displacement in bottom tensor index space, added to window_origin_ and applied
+    //          to every cached coordinate
+    CK_TILE_DEVICE void move(const BottomTensorIndex& step)
+    {
+        window_origin_ += step;
+        static_for<0, NumAccess, 1>{}([&](auto i_access) {
+            constexpr auto IAccess       = number<i_access>{};
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
+            // each cached coordinate is moved once, by the access recorded in the prefix sum
+            constexpr auto need_update_non_linear_coord =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
+            if constexpr(need_update_non_linear_coord)
+            {
+                move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
+                                       cached_coords_(non_linear_id),
+                                       step);
+            }
+            // move the current coord with linear_coords
+            // (on a copy, so the cached coordinate itself keeps only the non-linear part)
+            auto tmp_coords             = cached_coords_[non_linear_id];
+            constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess);
+            move_tensor_coordinate(
+                bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord);
+            // the validity flag is recomputed for every access after the move
+            cached_flags_(IAccess) = coordinate_has_valid_offset_assuming_top_index_is_valid(
+                bottom_tensor_view_.get_tensor_descriptor(), tmp_coords);
+        });
+    }
+    // Re-anchors the window at `new_window_origin` and rebuilds the per-thread caches.
+    //   new_window_origin : new origin of the window in bottom tensor index space
+    // Both caches are rebuilt: cached_coords_ (one coordinate per non-linear access) and
+    // cached_flags_ (one validity flag per access). The flags are refreshed here for the same
+    // reason move() refreshes them: they are derived from the coordinates, so leaving them
+    // untouched would make later load/store calls use validity computed for the old origin.
+    CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex& new_window_origin)
+    {
+        window_origin_ = new_window_origin;
+        // coordinate of this thread (warp id, lane id) at Y index 0 inside the window
+        auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
+            TileDstr{}.get_ps_ys_to_xs_adaptor(),
+            container_concat(make_tuple(get_warp_id(), get_lane_id()),
+                             generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
+        BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
+            window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
+        auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
+            bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
+        // pre-compute the coordinates and flags used by
+        // future load/store() calls (might allocate more registers)
+        using SFC_Ys = typename traits::SFC_Ys;
+        static_for<0, NumAccess, 1>{}([&](auto i_access) {
+            constexpr auto IAccess       = number<i_access>{};
+            constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
+            constexpr auto need_save_non_linear_coord =
+                bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
+            if constexpr(need_save_non_linear_coord)
+            {
+                cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
+            }
+            // at this point the temporary coordinate is the full coordinate of access i_access,
+            // which is what move() evaluates (cached coordinate + linear coordinate)
+            cached_flags_(IAccess) = coordinate_has_valid_offset_assuming_top_index_is_valid(
+                bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp);
+            if constexpr(i_access != (NumAccess - 1))
+            {
+                // step to the next access along the space-filling curve (P dims stay fixed)
+                constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
+                constexpr auto idx_diff_ps_ys = container_concat(
+                    generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
+                    idx_diff_ys);
+                move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                    window_adaptor_thread_coord_tmp,
+                    bottom_tensor_thread_coord_tmp,
+                    idx_diff_ps_ys);
+            }
+        });
+    }
+    // forwards to init_raw() of the underlying bottom tensor view
+    CK_TILE_HOST_DEVICE void init_raw() { bottom_tensor_view_.init_raw(); }
+    // this is the bottom tensor view
+    // [x0', x1', ...] ==> [offset]
+    BottomTensorView bottom_tensor_view_;
+    //
+    WindowLengths window_lengths_;
+    // origin ([x0', x1', ...]) of window on bottom tensor
+    BottomTensorIndex window_origin_;
+    // Tile tensor distribution, which contains:
+    //   1. adaptor for window: [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...]
+    //   2. thread descriptor for thread tensor in register: [y0, y1, ...] ==> [d]
+    TileDstr tile_dstr_;
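+    // per-thread caches: the bottom tensor coordinate of each non-linear access,
+    // and the validity flag of every access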
+    array<BottomTensorCoord, traits::NumAccess_NonLinear> cached_coords_;
+    array<bool, traits::NumAccess> cached_flags_;
+};
+#undef WINDOW_DISPATCH_ISSUE
+namespace impl {
+// Chooses the default "linear bottom dims" sequence for a tensor view from its address space
+// and its number of dimensions (len_). Judging by the per-specialization comments below, an
+// entry of 1 marks a dimension as linear and 0 as non-linear.
+// Primary template (any address space without a specialization): uniform sequence of 0
+template <address_space_enum, index_t len_>
+struct default_linear_bottom_dims_impl
+{
+    using type = typename uniform_sequence_gen<len_, 0>::type;
+};
+template <index_t len_>
+struct default_linear_bottom_dims_impl<address_space_enum::global, len_>
+{
+    // global default to seq<0,0,....1>
+    // NOTE(review): instantiates uniform_sequence_gen<len_ - 1, 0>, so len_ >= 1 is assumed
+    using type = typename sequence_merge<typename uniform_sequence_gen<len_ - 1, 0>::type,
+                                         sequence<1>>::type;
+};
+template <index_t len_>
+struct default_linear_bottom_dims_impl<address_space_enum::lds, len_>
+{
+    // lds default to seq<1,1.....1>
+    using type = typename uniform_sequence_gen<len_, 1>::type;
+};
+} // namespace impl
+// Default LinearBottomDims_ for TensorView_: looked up in impl::default_linear_bottom_dims_impl
+// from the view's buffer address space and its number of dimensions.
+template <typename TensorView_>
+using default_linear_bottom_dims =
+    typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
+                                                   TensorView_::get_num_of_dimension()>::type;
+// Factory for tile_window_linear.
+// A linear window can turn part of the address computation into an immediate value and so
+// save registers. LinearBottomDims_ selects which bottom dims are treated as linear: for
+// those dims a constexpr linear_offset is generated (and finally passed as the immediate
+// offset of the buffer/lds instruction).
+//
+// Note: nothing here verifies that a dim is actually suitable for a linear offset;
+// the caller is responsible for choosing LinearBottomDims_ correctly.
+//
+// Examples:
+//   2d global matrix, LinearBottomDims_=seq<0, 1>: the last dim produces an immediate
+//   offset when a thread issues several accesses along it.
+//
+//   2d LDS buffer, LinearBottomDims_=seq<1, 1>: a single vgpr holds the offset and
+//   everything else uses immediate offsets.
+//
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_linear(const TensorView_& tensor_view,
+                        const WindowLengths_& window_lengths,
+                        const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                        const StaticTileDistribution_& tile_distribution,
+                        LinearBottomDims_ = {})
+{
+    // one linear/non-linear marker per bottom dimension is required
+    static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
+    using window_type = tile_window_linear<remove_cvref_t<TensorView_>,
+                                           remove_cvref_t<WindowLengths_>,
+                                           remove_cvref_t<StaticTileDistribution_>,
+                                           remove_cvref_t<LinearBottomDims_>>;
+    return window_type{tensor_view, window_lengths, origin, tile_distribution};
+}
+// Overload that derives view, lengths and origin from an existing tile window and pairs them
+// with a new tile distribution.
+template <
+    typename TileWindow_,
+    typename StaticTileDistribution_,
+    typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window_linear(const TileWindow_& tile_window,
+                        const StaticTileDistribution_& tile_distribution,
+                        LinearBottomDims_ = {})
+{
+    const auto& src_view    = tile_window.get_bottom_tensor_view();
+    const auto& src_lengths = tile_window.get_window_lengths();
+    const auto& src_origin  = tile_window.get_window_origin();
+    return make_tile_window_linear(
+        src_view, src_lengths, src_origin, tile_distribution, LinearBottomDims_{});
+}
+// Like make_tile_window_linear, but additionally calls init_raw() on the new window.
+// this version must not be called under a constexpr context
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
+CK_TILE_DEVICE auto
+make_tile_window_linear_raw(const TensorView_& tensor_view,
+                            const WindowLengths_& window_lengths,
+                            const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                            const StaticTileDistribution_& tile_distribution,
+                            LinearBottomDims_ = {})
+{
+    // one linear/non-linear marker per bottom dimension is required
+    static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
+    using window_type = tile_window_linear<remove_cvref_t<TensorView_>,
+                                           remove_cvref_t<WindowLengths_>,
+                                           remove_cvref_t<StaticTileDistribution_>,
+                                           remove_cvref_t<LinearBottomDims_>>;
+    window_type window{tensor_view, window_lengths, origin, tile_distribution};
+    window.init_raw();
+    return window;
+}
+// Overload that derives view, lengths and origin from an existing tile window and pairs them
+// with a new tile distribution, then builds the window through the raw factory.
+// Deliberately not constexpr: it forwards to the tensor-view overload of
+// make_tile_window_linear_raw, which is not constexpr and must not be called under a
+// constexpr context.
+template <
+    typename TileWindow_,
+    typename StaticTileDistribution_,
+    typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
+CK_TILE_DEVICE auto
+make_tile_window_linear_raw(const TileWindow_& tile_window,
+                            const StaticTileDistribution_& tile_distribution,
+                            LinearBottomDims_ = {})
+{
+    return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(),
+                                       tile_window.get_window_lengths(),
+                                       tile_window.get_window_origin(),
+                                       tile_distribution,
+                                       LinearBottomDims_{});
+}
+// Free-function form of tile_window_linear::move().
+//   window : window to move (modified in place)
+//   step   : displacement in bottom tensor index space
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          typename LinearBottomDims_>
+CK_TILE_DEVICE void move_tile_window(
+    tile_window_linear<TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_>&
+        window,
+    const typename tile_window_linear<TensorView_,
+                                      WindowLengths_,
+                                      StaticTileDistribution_,
+                                      LinearBottomDims_>::BottomTensorIndex& step)
+{
+    window.move(step);
+}
+} // namespace ck_tile
--- a/include/ck_tile/core/utility/functional_with_tuple.hpp
+++ b/include/ck_tile/core/utility/functional_with_tuple.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+// This file should not be included inside tuple.hpp!
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/container/sequence.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+#include <stdint.h>
+#include <utility>
+namespace ck_tile {
+namespace detail {
+// Recursive step of static_uford: consumes the front dimension and recurses on the rest.
+// RemainLengths: sequence<...>   lengths of the dimensions not yet iterated
+// RamainUnpacks: sequence<...>   pack size of each of those dimensions
+// Orders: sequence<...>          passed through unchanged to the terminal specialization
+template <class RemainLengths, class RamainUnpacks, class Orders>
+struct static_uford_impl
+{
+    CK_TILE_HOST_DEVICE constexpr static_uford_impl()
+    {
+        // the empty case is handled by the <sequence<>, sequence<>, Orders> specialization
+        static_assert(RemainLengths::size() > 0, "wrong! should not get here");
+        static_assert(RamainUnpacks::size() > 0, "wrong! should not get here");
+    }
+    // f                : callable finally invoked with one index sequence per pack
+    // CurrentUnpackIds : tuple of index sequences built so far by the outer dimensions
+    template <class F, class CurrentUnpackIds>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds) const
+    {
+        constexpr index_t pack_len = RamainUnpacks::front();
+        // walk the front dimension in strides of pack_len
+        static_for<0, RemainLengths::front(), pack_len>{}([=](auto I) {
+            // every existing pack is extended pack_len times: copy idx_ of the new tuple
+            // takes pack idx_ / pack_len and appends index I + idx_ % pack_len
+            constexpr auto new_pack = generate_tuple(
+                [&](auto idx_) {
+                    constexpr auto i_new_pack = number<I + idx_ % pack_len>{};
+                    constexpr auto i_pre_pack = number<idx_ / pack_len>{};
+                    return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
+                },
+                number<CurrentUnpackIds::size() * pack_len>{});
+            static_uford_impl<decltype(RemainLengths::pop_front()),
+                              decltype(RamainUnpacks::pop_front()),
+                              Orders>{}(f, new_pack);
+        });
+    }
+};
+// End of the recursion: no dimension is left, so the collected packs are handed to f.
+template <class Orders>
+struct static_uford_impl<sequence<>, sequence<>, Orders>
+{
+    // f        : callable receiving one index sequence per pack
+    // PackedId : tuple of fully built index sequences (still in iteration order)
+    template <class F, class PackedId>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId) const
+    {
+        // reorder each pack with Orders before calling f
+        constexpr auto restore_order = [](auto one_pack) {
+            return decltype(one_pack)::reorder_old_to_new(Orders{});
+        };
+        constexpr auto reordered_packs = transform_tuples(restore_order, PackedId{});
+        unpack(f, reordered_packs);
+    }
+};
+// Recursive step used by static_uford::operator()(F, number<i_access>): instead of looping,
+// it derives the index of the front dimension from a single flat access id.
+// RemainLengths / RamainUnpacks / Orders have the same meaning as in static_uford_impl.
+template <class RemainLengths, class RamainUnpacks, class Orders>
+struct static_uford_one_shot_impl
+{
+    // f                : callable finally invoked with one index sequence per pack
+    // CurrentUnpackIds : tuple of index sequences built so far by the outer dimensions
+    // current_acc      : remaining flat access id, relative to the dimensions still left
+    template <class F, class CurrentUnpackIds, index_t current_acc>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds, number<current_acc>) const
+    {
+        constexpr auto r_lens_stride =
+            reverse_exclusive_scan_sequence(RemainLengths{}, multiplies{}, number<1>{});
+        constexpr auto r_upks_stride =
+            reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies{}, number<1>{});
+        // presumably the number of accesses covered by one step of the front dimension
+        // (product of the inner lengths divided by product of the inner unpacks) - TODO confirm
+        // against reverse_exclusive_scan_sequence
+        constexpr index_t current_stride = r_lens_stride.front() / r_upks_stride.front();
+        constexpr index_t pack_len       = RamainUnpacks::front();
+        // first index of the front dimension touched by this access
+        constexpr index_t current_idx    = (current_acc / current_stride) * pack_len;
+        // every existing pack is extended pack_len times: copy idx_ of the new tuple
+        // takes pack idx_ / pack_len and appends index current_idx + idx_ % pack_len
+        constexpr auto new_pack = generate_tuple(
+            [&](auto idx_) {
+                constexpr auto i_new_pack = number<current_idx + idx_ % pack_len>{};
+                constexpr auto i_pre_pack = number<idx_ / pack_len>{};
+                return CurrentUnpackIds{}.at(i_pre_pack).push_back(i_new_pack);
+            },
+            number<CurrentUnpackIds::size() * pack_len>{});
+        // the remainder of the access id is resolved by the inner dimensions
+        static_uford_one_shot_impl<decltype(RemainLengths::pop_front()),
+                                   decltype(RamainUnpacks::pop_front()),
+                                   Orders>{}(f, new_pack, number<current_acc % current_stride>{});
+    }
+};
+// End of the one-shot recursion: no dimension is left, so the collected packs are handed to f.
+template <class Orders>
+struct static_uford_one_shot_impl<sequence<>, sequence<>, Orders>
+{
+    // f                   : callable receiving one index sequence per pack
+    // PackedId            : tuple of fully built index sequences (still in iteration order)
+    // number<current_acc> : leftover access id, not used at this point
+    template <class F, class PackedId, index_t current_acc>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, PackedId, number<current_acc>) const
+    {
+        // reorder each pack with Orders before calling f
+        constexpr auto restore_order = [](auto one_pack) {
+            return decltype(one_pack)::reorder_old_to_new(Orders{});
+        };
+        constexpr auto reordered_packs = transform_tuples(restore_order, PackedId{});
+        unpack(f, reordered_packs);
+    }
+};
+} // namespace detail
+// TODO: we may unify static_ford/static_uford in the future
+//
+// loop over nd space(sequence) with packs
+// the function passed in must take exactly num_packs arguments (one sequence per pack)
+//
+// e.g.
+// Lengths=seq<2, 3, 4>, Unpacks=seq<1, 1, 2>
+// static_uford<Lengths, Unpacks>{}([&](auto i_0, auto i_1){}); // require 2 args(packs)
+//
+// loop #0, i_0=seq<0, 0, 0>, i_1=seq<0, 0, 1>
+// loop #1, i_0=seq<0, 0, 2>, i_1=seq<0, 0, 3>
+// loop #2, i_0=seq<0, 1, 0>, i_1=seq<0, 1, 1>
+// loop #3, i_0=seq<0, 1, 2>, i_1=seq<0, 1, 3>
+// loop #4, i_0=seq<0, 2, 0>, i_1=seq<0, 2, 1>
+// loop #5, i_0=seq<0, 2, 2>, i_1=seq<0, 2, 3>
+// loop #6, i_0=seq<1, 0, 0>, i_1=seq<1, 0, 1>
+// ...
+//
+// Lengths : extent of every dimension
+// Unpacks : pack size of every dimension (defaults to 1 everywhere)
+// Orders  : dimension order used for the iteration (defaults to 0, 1, 2, ...)
+template <class Lengths,
+          class Unpacks = typename uniform_sequence_gen<Lengths::size(), 1>::type,
+          class Orders  = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
+struct static_uford
+{
+    // number of index sequences handed to f per call: product of all pack sizes
+    static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies{}, number<1>{});
+    CK_TILE_HOST_DEVICE constexpr static_uford()
+    {
+        static_assert(Lengths::size() > 0, "wrong! Lengths is empty");
+        static_assert(Lengths::size() == Unpacks::size(), "wrong! inconsistent size");
+        static_assert(Lengths::size() == Orders::size(), "wrong! inconsistent size");
+        // every length must be a multiple of its pack size
+        static_for<0, Lengths::size(), 1>{}(
+            [&](auto i) { static_assert(Lengths{}.at(i) % Unpacks{}.at(i) == 0); });
+    }
+    // number of times f is called by the looping operator(): product of Lengths / Unpacks
+    CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access()
+    {
+        using L_ = decltype(Lengths{} / Unpacks{});
+        return reduce_on_sequence(L_{}, multiplies{}, number<1>{});
+    }
+    // F signature: F(sequence<...> multi_id...)
+    // multi_id is the unordered multi-index
+    template <class F>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f) const
+    {
+        constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
+        constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
+        // start the recursion with a single empty pack
+        detail::static_uford_impl<decltype(ordered_lengths), decltype(ordered_unpacks), Orders>{}(
+            f, make_tuple(sequence<>{}));
+    }
+    // this version is friendly for issue function one by one
+    // (calls f once, for the access selected by i_access)
+    template <class F, index_t i_access>
+    CK_TILE_HOST_DEVICE constexpr void operator()(F f, number<i_access>) const
+    {
+        static_assert(i_access < get_num_of_access());
+        constexpr auto ordered_lengths = Lengths::reorder_new_to_old(Orders{});
+        constexpr auto ordered_unpacks = Unpacks::reorder_new_to_old(Orders{});
+        // start the recursion with a single empty pack and the full access id
+        detail::static_uford_one_shot_impl<decltype(ordered_lengths),
+                                           decltype(ordered_unpacks),
+                                           Orders>{}(
+            f, make_tuple(sequence<>{}), number<i_access>{});
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/core/utility/literals.hpp
+++ b/include/ck_tile/core/utility/literals.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cstdlib>
+namespace ck_tile {
+namespace literals {
+// User-defined suffixes that turn an integer literal into std::size_t.
+// Modelled on [P0330] Literal Suffix for (signed) size_t (C++23)
+// ref: https://wg21.link/p0330r8
+// Both spellings are provided and yield the same value.
+constexpr std::size_t operator""_uz(unsigned long long value)
+{
+    return static_cast<std::size_t>(value);
+}
+constexpr std::size_t operator""_zu(unsigned long long value)
+{
+    // same conversion as _uz
+    return operator""_uz(value);
+}
+} // namespace literals
+} // namespace ck_tile
--- a/include/ck_tile/core/utility/magic_div.hpp
+++ b/include/ck_tile/core/utility/magic_div.hpp
@@ -59,8 +59,16 @@ struct magic_division32_bit_range
    CK_TILE_DEVICE static constexpr uint32_t
    do_magic_division(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
-        uint32_t tmp = __umulhi(dividend, multiplier);
+        if(__builtin_is_constant_evaluated())
-        return (tmp + dividend) >> shift;
+        {
+            uint32_t tmp = (static_cast<uint64_t>(dividend) * multiplier) >> 32;
+            return (tmp + dividend) >> shift;
+        }
+        else
+        {
+            uint32_t tmp = __umulhi(dividend, multiplier);
+            return (tmp + dividend) >> shift;
+        }
    }
    CK_TILE_HOST static constexpr uint32_t
@@ -77,9 +85,18 @@ struct magic_division32_bit_range
    CK_TILE_DEVICE static constexpr int32_t
    do_magic_division(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
-        uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+        if(__builtin_is_constant_evaluated())
-        uint32_t tmp          = __umulhi(dividend_u32, multiplier);
+        {
-        return (tmp + dividend_u32) >> shift;
+            uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+            uint32_t tmp          = (static_cast<uint64_t>(dividend_u32) * multiplier) >> 32;
+            return (tmp + dividend_u32) >> shift;
+        }
+        else
+        {
+            uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
+            uint32_t tmp          = __umulhi(dividend_u32, multiplier);
+            return (tmp + dividend_u32) >> shift;
+        }
    }
    CK_TILE_HOST static constexpr int32_t

--- a/include/ck_tile/core/utility/reduce_operator.hpp
+++ b/include/ck_tile/core/utility/reduce_operator.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core/config.hpp"
+namespace ck_tile {
+namespace ReduceOp {
+// y = ReduceOp(y, x);
+// Sum reduction.
+struct Add
+{
+    // Starting value of the accumulation: zero converted to T.
+    template <typename T>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    }
+    // Returns y + x for float, double, int32_t and int8_t.
+    // The two overloads are told apart by a non-type enable_if parameter, so both can take
+    // the accumulator by const reference.
+    template <typename T,
+              std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                   std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>,
+                               bool> = false>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return y + x;
+    }
+    // Returns y + x for half_t and bf16_t; the addition is carried out in float and the
+    // result converted back to T. Neither argument is modified.
+    template <typename T,
+              std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t>, bool> = false>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        float y_ = type_convert<float>(y);
+        float x_ = type_convert<float>(x);
+        return type_convert<T>(y_ + x_);
+    }
+};
+// Sum-of-squares reduction.
+struct SquareAdd
+{
+    // Starting value of the accumulation: zero converted to T.
+    template <typename T>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return type_convert<T>(0.0f);
+    }
+    // Returns y + x * x. Enabled only for float, double, int32_t and int8_t.
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        const auto x_squared = x * x;
+        return y + x_squared;
+    }
+};
+// Maximum reduction. Enabled only for float, double, int32_t and int8_t.
+struct Max
+{
+    // Starting value of the reduction: the lowest finite value of T, so that any input
+    // replaces it.
+    // NOTE(review): this assumes ck_tile::numeric follows std::numeric_limits, where min() of a
+    // floating-point type is the smallest POSITIVE normal value. Using min() as the identity
+    // would then report a tiny positive maximum for all-negative input; lowest() is the same
+    // value as min() for the integer types, so those are unaffected.
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return numeric<T>::lowest();
+    }
+    // Returns the larger of y and x.
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return max(y, x);
+    }
+};
+// Maximum-of-absolute-values reduction. Enabled only for float, double, int32_t and int8_t.
+struct AbsMax
+{
+    // Starting value of the reduction.
+    // NOTE(review): if ck_tile::numeric follows std::numeric_limits, min() of a floating-point
+    // type is the smallest positive normal value rather than zero, so an input whose absolute
+    // values are all below it would report that value instead - confirm whether zero is the
+    // intended identity.
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
+    {
+        return numeric<T>::min();
+    };
+    // Returns the larger of y and abs(x); y is taken as is (not made absolute).
+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
+    {
+        return max(y, abs(x));
+    }
+};
+} // namespace ReduceOp
+} // namespace ck_tile
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -19,10 +19,15 @@
 #include "ck_tile/host/reference/reference_batched_masking.hpp"
 #include "ck_tile/host/reference/reference_batched_rotary_position_embedding.hpp"
 #include "ck_tile/host/reference/reference_batched_softmax.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
 #include "ck_tile/host/reference/reference_gemm.hpp"
 #include "ck_tile/host/reference/reference_im2col.hpp"
-#include "ck_tile/host/reference/reference_layernorm2d.hpp"
+#include "ck_tile/host/reference/reference_layernorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_permute.hpp"
 #include "ck_tile/host/reference/reference_reduce.hpp"
+#include "ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp"
+#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
+#include "ck_tile/host/reference/reference_topk.hpp"
 #include "ck_tile/host/stream_config.hpp"
 #include "ck_tile/host/timer.hpp"
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -10,6 +10,7 @@
 #include <random>
 #include <type_traits>
 #include <utility>
+#include <unordered_set>
 #include "ck_tile/core.hpp"
@@ -41,6 +42,73 @@ struct FillUniformDistribution
    }
 };
+namespace impl {
+// Maps a size in bytes to the unsigned integer type of that size.
+// The primary template is empty, so sizes other than 1, 2, 4 and 8 expose no nested `type`.
+template <index_t bytes>
+struct RawIntegerType_
+{
+};
+template <>
+struct RawIntegerType_<1>
+{
+    using type = uint8_t;
+};
+template <>
+struct RawIntegerType_<2>
+{
+    using type = uint16_t;
+};
+template <>
+struct RawIntegerType_<4>
+{
+    using type = uint32_t;
+};
+template <>
+struct RawIntegerType_<8>
+{
+    using type = uint64_t;
+};
+// Unsigned integer type selected by sizeof(T).
+template <typename T>
+using RawIntegerType = typename RawIntegerType_<sizeof(T)>::type;
+} // namespace impl
+// Fills a range with values of T drawn from std::uniform_real_distribution<float>(a, b),
+// converted to T, such that no bit pattern is produced twice.
+// Note: the call operators are non-const because every call advances the random engine
+// and records the generated values; uniqueness therefore holds across calls until
+// clear() is invoked.
+// Warning: a draw is repeated until an unseen bit pattern comes up, so asking for more
+// elements than there are distinct values left never terminates.
+template <typename T>
+struct FillUniformDistribution_Unique
+{
+    float a_{-5.f};
+    float b_{5.f};
+    std::optional<uint32_t> seed_{11939};
+    std::mt19937 gen_{};
+    // bit patterns of every value handed out so far
+    std::unordered_set<impl::RawIntegerType<T>> set_{};
+    // a, b: bounds handed to the uniform distribution
+    // seed: engine seed; when it holds no value the seed comes from std::random_device
+    FillUniformDistribution_Unique(float a                      = -5.f,
+                                   float b                      = 5.f,
+                                   std::optional<uint32_t> seed = {11939})
+        : a_(a),
+          b_(b),
+          seed_(seed),
+          gen_{seed_.has_value() ? *seed_ : std::random_device{}()},
+          set_{}
+    {
+    }
+    // Assign a not-yet-used value to every element of [first, last).
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last)
+    {
+        std::mt19937& gen = gen_;
+        std::uniform_real_distribution<float> dis(a_, b_);
+        auto& set = set_;
+        std::generate(first, last, [&dis, &gen, &set]() {
+            T v = static_cast<T>(0);
+            // insert() reports whether the pattern was new, so one hash lookup both tests
+            // for a duplicate and records the value
+            do
+            {
+                v = ck_tile::type_convert<T>(dis(gen));
+            } while(!set.insert(bit_cast<impl::RawIntegerType<T>>(v)).second);
+            return v;
+        });
+    }
+    // Range overload; only participates when the iterator overload accepts the range's
+    // begin/end iterators.
+    template <typename ForwardRange>
+    auto operator()(ForwardRange&& range)
+        -> std::void_t<decltype(std::declval<FillUniformDistribution_Unique&>()(
+            std::begin(std::forward<ForwardRange>(range)),
+            std::end(std::forward<ForwardRange>(range))))>
+    {
+        (*this)(std::begin(std::forward<ForwardRange>(range)),
+                std::end(std::forward<ForwardRange>(range)));
+    }
+    // Forget all previously generated values; the random engine state is left untouched.
+    void clear() { set_.clear(); }
+};
 template <typename T>
 struct FillNormalDistribution
 {

--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -11,6 +11,7 @@
 #include <thread>
 #include <utility>
 #include <vector>
+#include <functional>
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/ranges.hpp"
@@ -545,6 +546,28 @@ struct HostTensor
    typename Data::size_type size() const { return mData.size(); }
+    // Return the sub-tensor covering [s_begin[d], s_end[d]) along every dimension d.
+    // For simplicity the data is copied into a new tensor instead of being viewed in place.
+    // s_begin and s_end must each hold one entry per dimension, with
+    // s_begin[d] <= s_end[d] <= get_length(d).
+    auto slice(std::vector<size_t> s_begin, std::vector<size_t> s_end) const
+    {
+        assert(s_begin.size() == s_end.size());
+        assert(s_begin.size() == get_num_of_dimension());
+        // an inverted range would wrap around in the unsigned subtraction below, and an end
+        // past the source extent would make the copy read out of bounds
+        for(size_t d = 0; d < s_begin.size(); ++d)
+        {
+            assert(s_begin[d] <= s_end[d]);
+            assert(s_end[d] <= get_length(d));
+        }
+        // extent of the slice along each dimension
+        std::vector<size_t> s_len(s_begin.size());
+        std::transform(
+            s_end.begin(), s_end.end(), s_begin.begin(), s_len.begin(), std::minus<size_t>{});
+        HostTensor<T> sliced_tensor(s_len);
+        sliced_tensor.ForEach([&](auto& self, const auto& idx) {
+            // shift the destination index by the slice origin to get the source index
+            std::vector<size_t> src_idx(idx.size());
+            std::transform(
+                idx.begin(), idx.end(), s_begin.begin(), src_idx.begin(), std::plus<size_t>{});
+            self(idx) = (*this)(src_idx);
+        });
+        return sliced_tensor;
+    }
    template <typename U = T>
    auto AsSpan() const
    {

--- a/include/ck_tile/host/reference/reference_elementwise.hpp
+++ b/include/ck_tile/host/reference/reference_elementwise.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+namespace ck_tile {
+// Host reference for a unary element-wise operation.
+// For every flat index i: a.mData[i] is converted to ComputeDataType, passed through
+// element_op, and the result is converted to BDataType and stored in b.mData[i].
+// The index range is taken from b.get_element_space_size().
+template <typename ADataType, typename BDataType, typename ComputeDataType, typename ElementOp>
+CK_TILE_HOST void reference_unary_elementwise(const HostTensor<ADataType>& a,
+                                              HostTensor<BDataType>& b,
+                                              ElementOp element_op)
+{
+    // TODO: implement gpu version reference function
+    auto apply_at = [&a, &b, &element_op](auto idx) {
+        auto src     = type_convert<ComputeDataType>(a.mData[idx]);
+        auto result  = element_op(src);
+        b.mData[idx] = ck_tile::type_convert<BDataType>(result);
+    };
+    const auto num_elements = b.get_element_space_size();
+    make_ParallelTensorFunctor(apply_at, num_elements)(std::thread::hardware_concurrency());
+}
+// Host reference for a binary element-wise operation.
+// For every flat index i: a.mData[i] and b.mData[i] are converted to ComputeDataType,
+// combined by element_op(a, b), and the result is converted to CDataType and stored in
+// c.mData[i]. The index range is taken from c.get_element_space_size().
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ComputeDataType,
+          typename ElementOp>
+CK_TILE_HOST void reference_binary_elementwise(const HostTensor<ADataType>& a,
+                                               const HostTensor<BDataType>& b,
+                                               HostTensor<CDataType>& c,
+                                               ElementOp element_op)
+{
+    // TODO: implement gpu version reference function
+    auto apply_at = [&a, &b, &c, &element_op](auto idx) {
+        auto lhs     = type_convert<ComputeDataType>(a.mData[idx]);
+        auto rhs     = type_convert<ComputeDataType>(b.mData[idx]);
+        auto result  = element_op(lhs, rhs);
+        c.mData[idx] = ck_tile::type_convert<CDataType>(result);
+    };
+    const auto num_elements = c.get_element_space_size();
+    make_ParallelTensorFunctor(apply_at, num_elements)(std::thread::hardware_concurrency());
+}
+} // namespace ck_tile
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
+#include <cstdlib>
+#include <thread>
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/host_tensor.hpp"
-#include "ck_tile/ops/common/tensor_layout.hpp"
-#include <thread>
 namespace ck_tile {
@@ -14,55 +15,36 @@ template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
-          typename LayoutA,
-          typename LayoutB,
-          typename LayoutC,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
 CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
-                                 const HostTensor<BDataType>& b_n_k,
+                                 const HostTensor<BDataType>& b_k_n,
                                 HostTensor<CDataType>& c_m_n,
                                 const AElementOp& a_element_op     = {},
                                 const BElementOp& b_element_op     = {},
                                 const ACCElementOp& acc_element_op = {})
 {
-    const int N = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
+    const std::size_t M = a_m_k.get_length(0);
-                      ? b_n_k.mDesc.get_lengths()[0]
+    const std::size_t N = b_k_n.get_length(1);
-                      : b_n_k.mDesc.get_lengths()[1];
+    const std::size_t K = a_m_k.get_length(1);
-    const int K = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
-                      ? a_m_k.mDesc.get_lengths()[1]
+    auto f_mn = [&](auto m, auto n) {
-                      : a_m_k.mDesc.get_lengths()[0];
+        AccDataType v_acc = 0;
-    const int M = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
-                      ? a_m_k.mDesc.get_lengths()[0]
+        for(std::size_t k = 0; k < K; ++k)
-                      : a_m_k.mDesc.get_lengths()[1];
-    auto f = [&](auto m) {
-        for(int n = 0; n < N; ++n)
        {
-            AccDataType v_acc = 0;
+            ADataType v_a = a_element_op(a_m_k(m, k));
+            BDataType v_b = b_element_op(b_k_n(k, n));
-            for(int k = 0; k < K; ++k)
-            {
+            v_acc +=
-                ADataType v_a = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
+                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
-                                    ? a_element_op(a_m_k(m, k))
-                                    : a_element_op(a_m_k(k, m));
-                BDataType v_b = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
-                                    ? b_element_op(b_n_k(n, k))
-                                    : b_element_op(b_n_k(k, n));
-                v_acc += ck_tile::type_convert<AccDataType>(v_a) *
-                         ck_tile::type_convert<AccDataType>(v_b);
-            }
-            CDataType& c_ref = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
-                                   ? c_m_n(m, n)
-                                   : c_m_n(n, m);
-            c_ref            = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
        }
+        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };
-    make_ParallelTensorFunctor(f, M)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
 }
 template <typename ADataType,