Commit eca84f93 authored by root

Merge branch 'gemm_bf16_sk_muozturk' of https://github.com/ROCm/composable_kernel into gemm_bf16_sk_muozturk
parents 6f210155 c256f018
......@@ -11,13 +11,15 @@
#define __gfx94__
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
defined(__gfx10_3_generic__)
#define __gfx103__
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
defined(__gfx1103__) || defined(__gfx11_generic__)
#define __gfx11__
#endif
#if defined(__gfx1200__) || defined(__gfx1201__)
#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
#define __gfx12__
#endif
......@@ -32,13 +34,28 @@
#define CK_TILE_DEVICE inline __device__
#define CK_TILE_HOST_DEVICE inline __host__ __device__
#define CK_TILE_DEVICE_EXTERN __device__
#define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
#else
#define CK_TILE_HOST inline
#define CK_TILE_DEVICE inline
#define CK_TILE_HOST_DEVICE inline
#define CK_TILE_DEVICE_EXTERN
#define CK_TILE_HOST_DEVICE_EXTERN
#endif
// implementing the "memory address space" attribute
// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
#ifdef __HIPCC__
#define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
#define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
#define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
#define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
#else
#define CK_TILE_GENERIC_ADDR
#define CK_TILE_GLOBAL_ADDR
#define CK_TILE_LDS_ADDR
#define CK_TILE_BUF_RES_ADDR
#endif
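// A minimal usage sketch (illustrative only, not part of this change): the
// numeric address spaces follow the LLVM AMDGPU table linked above, e.g.
// address_space(3) is LDS, so qualified pointers can be declared as
//   CK_TILE_LDS_ADDR float* p_lds_tile;    // shared-memory (LDS) pointer
//   CK_TILE_GLOBAL_ADDR const float* p_in; // global-memory pointer
// On non-HIP compilers the qualifiers expand to nothing, so the same
// declarations still build as plain pointers.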
#ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
#define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
#endif
......@@ -47,6 +64,7 @@
#define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
#define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
#define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
#define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
#ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
#define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
......@@ -157,8 +175,11 @@
#endif
#endif
// workaround for ROCm 6.2 and later
#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133
#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
#else
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
......@@ -200,3 +221,12 @@
#ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
#define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
#endif
// workaround: compiler not emitting the reciprocal instruction from __frcp_rn()
#ifndef CK_TILE_WORKAROUND_SWDEV_383542
#define CK_TILE_WORKAROUND_SWDEV_383542 1
#endif
#ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
#define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
#endif
......@@ -30,7 +30,7 @@ struct meta_data_buffer
{
constexpr index_t size = sizeof(T);
auto tmp = bit_cast<array<std::byte, size>>(data);
auto tmp = ck_tile::bit_cast<array<std::byte, size>>(data);
for(int i = 0; i < size; i++)
{
......@@ -66,7 +66,7 @@ struct meta_data_buffer
pos++;
}
data = bit_cast<T>(tmp);
data = ck_tile::bit_cast<T>(tmp);
}
return data;
......@@ -86,7 +86,7 @@ struct meta_data_buffer
pos++;
}
auto data = bit_cast<T>(tmp);
auto data = ck_tile::bit_cast<T>(tmp);
return data;
}
......
......@@ -1111,4 +1111,126 @@ CK_TILE_HOST_DEVICE constexpr auto generate_array(F&& f, number<N>)
typename arithmetic_sequence_gen<0, N, 1>::type{});
}
namespace impl {
template <typename, typename, typename, index_t>
struct reverse_slice_sequence_impl;
template <index_t x,
index_t... xs,
index_t m,
index_t... ms,
index_t id,
index_t... ids,
index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x, xs...>,
sequence<m, ms...>,
sequence<id, ids...>,
SliceSize>
{
using old_scan =
reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths =
typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
using dim_slices =
typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
using remaining_slice_sizes = typename sequence_merge<
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
typename old_scan::remaining_slice_sizes>::type;
// the first idx at which the sliced length differs from the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t _split_idx =
std::conditional_t<_split_flag, number<id>, number<0>>::value;
static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
static constexpr index_t split_idx = std::
conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
};
template <index_t x, index_t m, index_t id, index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
{
static constexpr auto slice_size = SliceSize;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths = sequence<slice_length>;
using dim_slices = sequence<x / slice_length>;
using remaining_slice_sizes =
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
// the first idx at which the sliced length differs from the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t split_idx =
std::conditional_t<split_flag, number<id>, number<0>>::value;
};
} // namespace impl
// clang-format off
// input: a sequence (with an optional mask) and SliceSize, the size per slice
// output: the per-slice length of each dimension and the number of slices per dimension
//
// e.g. <2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1
//
// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0
// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0
// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1
// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2
// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2
// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2
//
// <4, 2, 1, 4, 2> / 4 ->
// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0
//
// return tuple<slice_lengths, slice_nums, slice_index>; slice_index is the index (scanning
// right -> left) at which the slices start to split, i.e. the first index where the
// sliced length differs from the original length
// clang-format on
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
static_assert(Seq::size() == Mask::size());
using sliced_type =
impl::reverse_slice_sequence_impl<Seq,
Mask,
typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
SliceSize>;
static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
"can not evenly divide this sequence, please check");
return make_tuple(typename sliced_type::dim_lengths{},
typename sliced_type::dim_slices{},
number<sliced_type::split_idx>{});
}
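// Hedged usage sketch, mirroring the first example in the comment above
// (<2, 1, 4, 2> sliced by 8):
//   constexpr auto r = reverse_slice_sequence(sequence<2, 1, 4, 2>{}, number<8>{});
//   // r[number<0>{}] -> sequence<1, 1, 4, 2>   per-dimension slice lengths
//   // r[number<1>{}] -> sequence<2, 1, 1, 1>   slices per dimension (2 in total)
//   // r[number<2>{}] -> 0                      slice_idx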
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
constexpr auto r =
reverse_slice_sequence(Seq{}.reverse(), number<SliceSize>{}, Mask{}.reverse());
return make_tuple(r[number<0>{}].reverse(),
r[number<1>{}].reverse(),
number<Seq::size() - r[number<2>{}] - 1>{});
}
} // namespace ck_tile
......@@ -488,6 +488,26 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y,
f, x, y, z, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
}
namespace detail {
template <typename F, typename X, index_t... Is>
CK_TILE_HOST_DEVICE constexpr auto embed_tuples_impl(F f, const X& x, sequence<Is...>)
{
return concat_tuple(f(x.at(number<Is>{}))...);
}
} // namespace detail
// make sure F returns at least a tuple
// e.g. x : tuple<X, Y>, and f returns tuple<Z, W> for each element;
// this function returns the concatenation of those results, i.e. tuple<Z, W, Z, W>
template <typename F, typename X>
CK_TILE_HOST_DEVICE constexpr auto embed_tuples(F f, const X& x)
{
return detail::embed_tuples_impl(
f, x, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
}
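// Hedged usage sketch: with a hypothetical f that duplicates its argument,
//   embed_tuples([](auto e) { return make_tuple(e, e); }, make_tuple(a, b))
// returns make_tuple(a, a, b, b), i.e. the concatenation of f applied to each
// element of x (see embed_tuples_impl above).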
// By default unroll to the fully flattened form
template <index_t Depth = 0, index_t MaxDepth = -1>
CK_TILE_HOST_DEVICE constexpr auto unroll_nested_tuple(const tuple<>& t)
......@@ -603,7 +623,7 @@ template <typename... Ys,
false>
CK_TILE_HOST_DEVICE constexpr auto operator+=(tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
static_assert(X::size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y[i] += x[i]; });
return y;
......@@ -615,7 +635,7 @@ template <typename... Ys,
false>
CK_TILE_HOST_DEVICE constexpr auto operator-=(tuple<Ys...>& y, const X& x)
{
static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
static_assert(X::size() == sizeof...(Ys), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Ys);
static_for<0, NSize, 1>{}([&](auto i) { y[i] -= x[i]; });
return y;
......@@ -627,7 +647,7 @@ template <typename... Xs,
false>
CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
tuple<Xs...> r;
......@@ -635,13 +655,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const Y& y)
return r;
}
template <typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator+(const tuple<Xs...>& x, const tuple<Ys...>& y)
{
static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
constexpr index_t NSize = sizeof...(Xs);
return generate_tuple([&](auto i) { return x[i] + y[i]; }, number<NSize>{});
}
template <typename... Xs,
typename Y,
std::enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> =
false>
CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
tuple<Xs...> r;
......@@ -649,13 +677,21 @@ CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const Y& y)
return r;
}
template <typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator-(const tuple<Xs...>& x, const tuple<Ys...>& y)
{
static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
constexpr index_t NSize = sizeof...(Xs);
return generate_tuple([&](auto i) { return x[i] - y[i]; }, number<NSize>{});
}
template <typename... Xs,
typename Y,
std::enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> =
false>
CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, const Y& y)
{
static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
static_assert(Y::size() == sizeof...(Xs), "wrong! size not the same");
constexpr index_t NSize = sizeof...(Xs);
tuple<Xs...> r;
......@@ -686,6 +722,14 @@ CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, Y a)
return a * x;
}
template <typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator*(const tuple<Xs...>& x, const tuple<Ys...>& y)
{
static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong!");
constexpr index_t NSize = sizeof...(Xs);
return generate_tuple([&](auto i) { return x[i] * y[i]; }, number<NSize>{});
}
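// Hedged illustration of the element-wise tuple-tuple operators added in this
// change (values are hypothetical):
//   auto a = make_tuple(1, 2, 3);
//   auto b = make_tuple(4, 5, 6);
//   auto s = a + b; // make_tuple(5, 7, 9)
//   auto d = a - b; // make_tuple(-3, -3, -3)
//   auto p = a * b; // make_tuple(4, 10, 18)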
template <typename... Xs, typename... Ys>
CK_TILE_HOST_DEVICE constexpr auto operator/(const tuple<Xs...>& x, const tuple<Ys...>& y)
{
......
......@@ -18,6 +18,7 @@ enum class bf16_rounding_mode
truncate_with_nan,
truncate,
standard_asm,
rta_asm, // round to nearest away
};
template <bf16_rounding_mode rounding =
......@@ -180,6 +181,39 @@ uint16_t float_to_bf16_rtn_asm(float f)
return uint16_t(u.int32);
}
// TODO: do we need this on host?
CK_TILE_HOST
uint16_t float_to_bf16_rta_asm(float f) { return float_to_bf16_rtn_raw(f); }
CK_TILE_DEVICE
uint16_t float_to_bf16_rta_asm(float f)
{
union
{
float fp32;
struct
{
uint16_t lo;
uint16_t hi;
};
} u = {f};
const uint32_t low_nan = 0x7fff;
const uint32_t hi_nan = 0x7fff0000;
using uint32x2_t = uint32_t __attribute__((ext_vector_type(2)));
uint32x2_t check_nan;
asm volatile("v_cmp_u_f32 %[s_cnan], %[v_x], %[v_x] \n"
"v_add3_u32 %[v_x], %[v_x], %[v_blo], 1 \n"
"v_cndmask_b32 %[v_x], %[v_x], %[v_bhi], %[s_cnan]"
: [s_cnan] "+s"(check_nan), [v_x] "+v"(u.fp32)
: [v_blo] "v"(low_nan), [v_bhi] "v"(hi_nan));
// Note: in the above code snippet, we use the high 16 bits
return u.hi;
}
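// Worked sketch of the rounding performed above (a hedged reading of the asm,
// not additional code): v_add3_u32 computes bits(f) + 0x7fff + 1, i.e. it
// always adds 0x8000 before the high 16 bits are kept, which rounds to the
// nearest bf16 with ties away from zero. E.g. 0x3f808000 (exactly halfway
// between bf16 0x3f80 and 0x3f81) becomes 0x3f810000, so the result is 0x3f81.
// The v_cmp_u_f32 / v_cndmask_b32 pair substitutes 0x7fff0000 when the input
// is NaN, so the returned high half is the NaN pattern 0x7fff.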
// Truncate instead of rounding, preserving SNaN
CK_TILE_HOST_DEVICE
constexpr uint16_t float_to_bf16_truc_nan_raw(float f)
......@@ -213,6 +247,8 @@ CK_TILE_HOST_DEVICE constexpr uint16_t float_to_bf16_raw(float f, constant<round
return float_to_bf16_rtn_asm(f);
else if constexpr(rounding == bf16_rounding_mode::truncate_with_nan)
return float_to_bf16_truc_nan_raw(f);
else if constexpr(rounding == bf16_rounding_mode::rta_asm)
return float_to_bf16_rta_asm(f);
else
return float_to_bf16_truc_raw(f);
}
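// e.g. calling float_to_bf16_raw(f, constant<bf16_rounding_mode::rta_asm>{})
// now dispatches to the float_to_bf16_rta_asm() path added above.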
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/numeric/numeric.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/random.hpp"
#include <stdint.h>
#include <type_traits>
#pragma once
namespace ck_tile {
// use int8_t directly for int8 arithmetic
// here one can use ck_tile::int8_t to access the original int8_t
using int8_t = int8_t;
// limits
template <class T>
struct numeric;
template <>
struct numeric<int8_t>
{
// minimum finite value, or minimum positive normalized value for float
CK_TILE_HOST_DEVICE static constexpr int8_t min() { return int8_t(-128); }
// minimum finite value
CK_TILE_HOST_DEVICE static constexpr int8_t lowest() { return int8_t(-128); }
// maximum finite value
CK_TILE_HOST_DEVICE static constexpr int8_t max() { return int8_t(127); }
// difference between 1.0 and next value representable by float
CK_TILE_HOST_DEVICE static constexpr int8_t epsilon()
{
return 1; // not used
}
CK_TILE_HOST_DEVICE static constexpr int8_t round_error()
{
return 1; // not used
}
// positive infinity value
CK_TILE_HOST_DEVICE static constexpr int8_t infinity()
{
return 1; // not used
}
// quiet NaN
CK_TILE_HOST_DEVICE static constexpr int8_t quiet_NaN()
{
return 1; // not used
}
// signaling NaN
CK_TILE_HOST_DEVICE static constexpr int8_t signaling_NaN()
{
return 1; // not used
}
// smallest positive subnormal value
CK_TILE_HOST_DEVICE static constexpr int8_t denorm_min()
{
return 1; // not used
}
CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; }
};
#if 0
template <typename T>
struct numeric_traits;
template <>
struct numeric_traits<int8_t>
{
static constexpr int exp = 5;
static constexpr int mant = 10;
static constexpr int bias = 15;
static constexpr uint16_t nan_mask = 0x7C00;
static constexpr uint16_t head_mask = 0xFC00;
static constexpr uint16_t mant_mask = 0x3FF;
static constexpr uint16_t exp_mask = 0x1F;
static constexpr uint32_t Inf = 0x7C00;
static constexpr uint32_t NegInf = 0xFC00;
static constexpr uint32_t NaN = 0x7C01;
static constexpr uint32_t Neg0 = 0x8000;
using bitwise_type = uint16_t;
};
#endif
CK_TILE_HOST_DEVICE
constexpr float int8_to_float(const int8_t& x) { return static_cast<float>(x); }
CK_TILE_HOST_DEVICE
constexpr int8_t float_to_int8(const float& x) { return static_cast<int8_t>(x); }
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -487,55 +487,12 @@ struct log2e<float>
template <typename T = double>
constexpr T log2e_v = log2e<T>::value;
// math
CK_TILE_HOST_DEVICE
float abs(const float& x)
{
union
{
float f32;
uint32_t u32;
} y;
y.f32 = x;
y.u32 = y.u32 & 0x7fffffff;
return y.f32;
}
CK_TILE_HOST_DEVICE
bool isnan(const float& x)
{
uint32_t xx = bit_cast<uint32_t>(x);
return (xx & 0x7fffffff) > 0x7F800000;
}
CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); };
CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); };
CK_TILE_DEVICE
float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); };
CK_TILE_DEVICE
double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
CK_TILE_DEVICE
float exp(float x) { return __ocml_exp_f32(x); };
CK_TILE_HOST
float exp(float x) { return std::expf(x); }
CK_TILE_DEVICE
float exp2(float x) { return exp2f(x); };
CK_TILE_HOST
float exp2(float x) { return std::exp2f(x); };
CK_TILE_DEVICE
float log(float x) { return __logf(x); };
CK_TILE_HOST
float log(float x) { return std::logf(x); };
CK_TILE_DEVICE uint16_t sad_u16(uint16_t x, uint16_t y, uint16_t acc)
{
return __builtin_amdgcn_sad_u16(x, y, acc);
......@@ -554,4 +511,933 @@ CK_TILE_HOST uint32_t sad_u32(uint32_t x, uint32_t y, uint32_t acc)
return (x > y ? (x - y) : (y - x)) + acc;
}
///////////////////////////////////////////////////////////////
} // namespace ck_tile
// the functions below need the data types above to be pre-defined
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/bfloat16.hpp"
#include "ck_tile/core/numeric/float8.hpp"
#include "ck_tile/core/numeric/type_convert.hpp"
#ifndef __HIP_DEVICE_COMPILE__
#include <cmath>
#endif
namespace ck_tile {
#if CK_TILE_WORKAROUND_SWDEV_383542
extern "C" CK_TILE_DEVICE float __ocml_native_recip_f32(float);
#endif
// math functions for the host, some are implemented by calling C++ std functions
CK_TILE_HOST float abs(float x) { return std::abs(x); };
CK_TILE_HOST double abs(double x) { return std::abs(x); };
CK_TILE_HOST int8_t abs(int8_t x)
{
int8_t sgn = x >> (8 - 1);
return (x ^ sgn) - sgn;
};
CK_TILE_HOST int32_t abs(int32_t x)
{
int32_t sgn = x >> (32 - 1);
return (x ^ sgn) - sgn;
};
CK_TILE_HOST fp16_t abs(fp16_t x)
{
uint16_t xx = bit_cast<uint16_t>(x);
uint16_t abs_xx = xx & 0x7fff;
fp16_t abs_x = bit_cast<fp16_t>(abs_xx);
return abs_x;
};
#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
CK_TILE_HOST int4_t abs(int4_t x)
{
int4_t sgn = x >> (4 - 1);
return (x ^ sgn) - sgn;
}
#endif
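// Worked sketch of the branchless abs used above: for int8_t x = -5,
// sgn = x >> 7 = -1 (0xFF), x ^ sgn = 4, and 4 - (-1) = 5; for x >= 0,
// sgn = 0 and the expression reduces to x. (The usual caveat applies: the
// minimum value, e.g. int8_t -128, has no positive counterpart.)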
CK_TILE_HOST bool isnan(float x) { return std::isnan(x); };
CK_TILE_HOST bool isnan(double x) { return std::isnan(x); };
CK_TILE_HOST bool isnan(int8_t x)
{
(void)x;
return false;
};
CK_TILE_HOST bool isnan(int32_t x)
{
(void)x;
return false;
};
CK_TILE_HOST bool isnan(fp16_t x)
{
uint16_t xx = bit_cast<uint16_t>(x);
return (xx & 0x7FFF) > 0x7C00;
};
#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
CK_TILE_HOST bool isnan(int4_t x)
{
(void)x;
return false;
};
#endif
CK_TILE_HOST fp16_t sqrt(fp16_t x)
{
return static_cast<fp16_t>(std::sqrt(static_cast<float>(x)));
};
CK_TILE_HOST float sqrt(float x) { return std::sqrt(x); };
CK_TILE_HOST double sqrt(double x) { return std::sqrt(x); };
template <typename T>
CK_TILE_HOST T tanh(T x)
{
return type_convert<T>(std::tanhf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float tanh<float>(float x)
{
return std::tanhf(x);
};
template <>
CK_TILE_HOST double tanh<double>(double x)
{
return std::tanh(x);
};
template <typename T>
CK_TILE_HOST T acos(T x)
{
return type_convert<T>(std::acosf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float acos<float>(float x)
{
return std::acosf(x);
};
template <>
CK_TILE_HOST double acos<double>(double x)
{
return std::acos(x);
};
template <typename T>
CK_TILE_HOST T neg(T x)
{
return type_convert<T>(-(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float neg<float>(float x)
{
return -x;
};
template <>
CK_TILE_HOST double neg<double>(double x)
{
return -x;
};
template <>
CK_TILE_HOST int32_t neg<int32_t>(int32_t x)
{
return -x;
};
template <>
CK_TILE_HOST int8_t neg<int8_t>(int8_t x)
{
return -x;
};
template <typename T>
CK_TILE_HOST T atan(T x)
{
return type_convert<T>(std::atanf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float atan<float>(float x)
{
return std::atanf(x);
};
template <>
CK_TILE_HOST double atan<double>(double x)
{
return std::atan(x);
};
template <typename T>
CK_TILE_HOST T sin(T x)
{
return type_convert<T>(std::sinf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float sin<float>(float x)
{
return std::sinf(x);
};
template <>
CK_TILE_HOST double sin<double>(double x)
{
return std::sin(x);
};
template <typename T>
CK_TILE_HOST T asin(T x)
{
return type_convert<T>(std::asinf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float asin<float>(float x)
{
return std::asinf(x);
};
template <>
CK_TILE_HOST double asin<double>(double x)
{
return std::asin(x);
};
template <typename T>
CK_TILE_HOST T asinh(T x)
{
return type_convert<T>(std::asinhf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float asinh<float>(float x)
{
return std::asinhf(x);
};
template <>
CK_TILE_HOST double asinh<double>(double x)
{
return std::asinh(x);
};
template <typename T>
CK_TILE_HOST T cos(T x)
{
return type_convert<T>(std::cosf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float cos<float>(float x)
{
return std::cosf(x);
};
template <>
CK_TILE_HOST double cos<double>(double x)
{
return std::cos(x);
};
template <typename T>
CK_TILE_HOST T acosh(T x)
{
return type_convert<T>(std::acoshf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float acosh<float>(float x)
{
return std::acoshf(x);
};
template <>
CK_TILE_HOST double acosh<double>(double x)
{
return std::acosh(x);
};
template <typename T>
CK_TILE_HOST T tan(T x)
{
return type_convert<T>(std::tanf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float tan<float>(float x)
{
return std::tanf(x);
};
template <>
CK_TILE_HOST double tan<double>(double x)
{
return std::tan(x);
};
template <typename T>
CK_TILE_HOST T atanh(T x)
{
return type_convert<T>(std::atanhf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float atanh<float>(float x)
{
return std::atanhf(x);
};
template <>
CK_TILE_HOST double atanh<double>(double x)
{
return std::atanh(x);
};
template <typename T>
CK_TILE_HOST T sinh(T x)
{
return type_convert<T>(std::sinhf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float sinh<float>(float x)
{
return std::sinhf(x);
};
template <>
CK_TILE_HOST double sinh<double>(double x)
{
return std::sinh(x);
};
template <typename T>
CK_TILE_HOST T ceil(T x)
{
return type_convert<T>(std::ceilf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float ceil<float>(float x)
{
return std::ceilf(x);
};
template <>
CK_TILE_HOST double ceil<double>(double x)
{
return std::ceil(x);
};
template <typename T>
CK_TILE_HOST T cosh(T x)
{
return type_convert<T>(std::coshf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float cosh<float>(float x)
{
return std::coshf(x);
};
template <>
CK_TILE_HOST double cosh<double>(double x)
{
return std::cosh(x);
};
template <typename T>
CK_TILE_HOST T floor(T x)
{
return type_convert<T>(std::floorf(type_convert<float>(x)));
};
template <>
CK_TILE_HOST float floor<float>(float x)
{
return std::floorf(x);
};
template <>
CK_TILE_HOST double floor<double>(double x)
{
return std::floor(x);
};
template <typename T>
CK_TILE_HOST T rcp(T x)
{
return type_convert<T>(1.f / type_convert<float>(x));
};
template <typename T>
CK_TILE_HOST T exp(T x)
{
return type_convert<T>(std::expf(type_convert<float>(x)));
}
template <>
CK_TILE_HOST float exp<float>(float x)
{
return std::expf(x);
}
template <>
CK_TILE_HOST double exp<double>(double x)
{
return std::exp(x);
}
template <typename T>
CK_TILE_HOST T log(T x)
{
return type_convert<T>(std::logf(type_convert<float>(x)));
}
template <>
CK_TILE_HOST float log<float>(float x)
{
return std::logf(x);
}
template <>
CK_TILE_HOST double log<double>(double x)
{
return std::log(x);
}
template <typename T>
CK_TILE_HOST T pow(T x, T gamma)
{
return type_convert<T>(std::powf(type_convert<float>(x), type_convert<float>(gamma)));
}
template <>
CK_TILE_HOST float pow<float>(float x, float gamma)
{
return std::powf(x, gamma);
}
template <>
CK_TILE_HOST double pow<double>(double x, double gamma)
{
return std::pow(x, gamma);
}
template <typename T>
CK_TILE_HOST T expm1(T x)
{
return type_convert<T>(std::expm1f(type_convert<float>(x)));
}
template <>
CK_TILE_HOST float expm1<float>(float x)
{
return std::expm1f(x);
}
template <>
CK_TILE_HOST double expm1<double>(double x)
{
return std::expm1(x);
}
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
CK_TILE_DEVICE float abs(float x)
{
union
{
float f32;
uint32_t u32;
} y;
y.f32 = x;
y.u32 = y.u32 & 0x7fffffff;
return y.f32;
};
CK_TILE_DEVICE double abs(double x) { return ::abs(x); };
CK_TILE_DEVICE int8_t abs(int8_t x)
{
int8_t sgn = x >> (8 - 1);
return (x ^ sgn) - sgn;
};
CK_TILE_DEVICE int32_t abs(int32_t x)
{
int32_t sgn = x >> (32 - 1);
return (x ^ sgn) - sgn;
};
#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
CK_TILE_DEVICE int4_t abs(int4_t x)
{
int4_t sgn = x >> (4 - 1);
return (x ^ sgn) - sgn;
};
#endif
CK_TILE_DEVICE fp16_t abs(fp16_t x)
{
uint16_t xx = bit_cast<uint16_t>(x);
uint16_t abs_xx = xx & 0x7fff;
fp16_t abs_x = bit_cast<fp16_t>(abs_xx);
return abs_x;
};
CK_TILE_DEVICE bool isnan(float x) { return ::isnan(x); };
CK_TILE_DEVICE bool isnan(double x) { return ::isnan(x); };
CK_TILE_DEVICE bool isnan(int8_t x)
{
(void)x;
return false;
};
CK_TILE_DEVICE bool isnan(int32_t x)
{
(void)x;
return false;
};
#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
CK_TILE_DEVICE bool isnan(int4_t x)
{
(void)x;
return false;
};
#endif
CK_TILE_DEVICE bool isnan(fp16_t x)
{
uint16_t xx = bit_cast<uint16_t>(x);
return (xx & 0x7FFF) > 0x7C00;
};
CK_TILE_DEVICE fp16_t sqrt(fp16_t x)
{
return static_cast<fp16_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
};
CK_TILE_DEVICE float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); };
CK_TILE_DEVICE double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
template <typename T>
CK_TILE_DEVICE T tanh(T x)
{
return type_convert<T>(::tanhf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float tanh<float>(float x)
{
return ::tanhf(x);
};
template <>
CK_TILE_DEVICE double tanh<double>(double x)
{
return ::tanh(x);
};
template <typename T>
CK_TILE_DEVICE T acos(T x)
{
return type_convert<T>(::acosf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float acos<float>(float x)
{
return ::acosf(x);
};
template <>
CK_TILE_DEVICE double acos<double>(double x)
{
return ::acos(x);
};
template <typename T>
CK_TILE_DEVICE T neg(T x)
{
return type_convert<T>(-(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float neg<float>(float x)
{
return -x;
};
template <>
CK_TILE_DEVICE double neg<double>(double x)
{
return -x;
};
template <>
CK_TILE_DEVICE int32_t neg<int32_t>(int32_t x)
{
return -x;
};
template <>
CK_TILE_DEVICE int8_t neg<int8_t>(int8_t x)
{
return -x;
};
template <>
CK_TILE_DEVICE fp16_t neg<fp16_t>(fp16_t x)
{
return -x;
};
template <typename T>
CK_TILE_DEVICE T atan(T x)
{
return type_convert<T>(::atanf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float atan<float>(float x)
{
return ::atanf(x);
};
template <>
CK_TILE_DEVICE double atan<double>(double x)
{
return ::atan(x);
};
template <typename T>
CK_TILE_DEVICE T sin(T x)
{
return type_convert<T>(::sinf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float sin<float>(float x)
{
return ::sinf(x);
};
template <>
CK_TILE_DEVICE double sin<double>(double x)
{
return ::sin(x);
};
template <>
CK_TILE_DEVICE fp16_t sin<fp16_t>(fp16_t x)
{
return __ocml_sin_f16(x);
};
template <typename T>
CK_TILE_DEVICE T asin(T x)
{
return type_convert<T>(::asinf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float asin<float>(float x)
{
return ::asinf(x);
};
template <>
CK_TILE_DEVICE double asin<double>(double x)
{
return ::asin(x);
};
template <typename T>
CK_TILE_DEVICE T asinh(T x)
{
return type_convert<T>(::asinhf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float asinh<float>(float x)
{
return ::asinhf(x);
};
template <>
CK_TILE_DEVICE double asinh<double>(double x)
{
return ::asinh(x);
};
template <typename T>
CK_TILE_DEVICE T acosh(T x)
{
return type_convert<T>(::acoshf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float acosh<float>(float x)
{
return ::acoshf(x);
};
template <>
CK_TILE_DEVICE double acosh<double>(double x)
{
return ::acosh(x);
};
template <typename T>
CK_TILE_DEVICE T tan(T x)
{
return type_convert<T>(::tanf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float tan<float>(float x)
{
return ::tanf(x);
};
template <>
CK_TILE_DEVICE double tan<double>(double x)
{
return ::tan(x);
};
template <typename T>
CK_TILE_DEVICE T atanh(T x)
{
return type_convert<T>(::atanhf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float atanh<float>(float x)
{
return ::atanhf(x);
};
template <>
CK_TILE_DEVICE double atanh<double>(double x)
{
return ::atanh(x);
};
template <typename T>
CK_TILE_DEVICE T sinh(T x)
{
return type_convert<T>(::sinhf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float sinh<float>(float x)
{
return ::sinhf(x);
};
template <>
CK_TILE_DEVICE double sinh<double>(double x)
{
return ::sinh(x);
};
template <typename T>
CK_TILE_DEVICE T ceil(T x)
{
return type_convert<T>(::ceilf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float ceil<float>(float x)
{
return ::ceilf(x);
};
template <>
CK_TILE_DEVICE double ceil<double>(double x)
{
return ::ceil(x);
};
template <>
CK_TILE_DEVICE fp16_t ceil<fp16_t>(fp16_t x)
{
return __ocml_ceil_f16(x);
};
template <typename T>
CK_TILE_DEVICE T cosh(T x)
{
return type_convert<T>(::coshf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float cosh<float>(float x)
{
return ::coshf(x);
};
template <>
CK_TILE_DEVICE double cosh<double>(double x)
{
return ::cosh(x);
};
template <typename T>
CK_TILE_DEVICE T floor(T x)
{
return type_convert<T>(::floorf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float floor<float>(float x)
{
return ::floorf(x);
};
template <>
CK_TILE_DEVICE double floor<double>(double x)
{
return ::floor(x);
};
template <>
CK_TILE_DEVICE fp16_t floor<fp16_t>(fp16_t x)
{
return __ocml_floor_f16(x);
};
template <typename T>
CK_TILE_DEVICE T rcp(T x)
{
#if !CK_TILE_WORKAROUND_SWDEV_383542
return __frcp_rn(x);
#else
// return __ocml_native_recip_f32(x);
return __builtin_amdgcn_rcpf(x);
#endif
};
template <typename T>
CK_TILE_DEVICE T exp(T x)
{
return type_convert<T>(__ocml_exp_f32(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE fp16_t exp<fp16_t>(fp16_t x)
{
return __ocml_exp_f16(x);
};
template <>
CK_TILE_DEVICE float exp<float>(float x)
{
return __ocml_exp_f32(x);
};
template <>
CK_TILE_DEVICE double exp<double>(double x)
{
return exp(x);
};
template <typename T>
CK_TILE_DEVICE T log(T x)
{
return type_convert<T>(__logf(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE fp16_t log<fp16_t>(fp16_t x)
{
return __ocml_log_f16(x);
};
template <>
CK_TILE_DEVICE float log<float>(float x)
{
return __logf(x);
};
template <>
CK_TILE_DEVICE double log<double>(double x)
{
return log(x);
};
template <typename T>
CK_TILE_DEVICE T pow(T x, T gamma)
{
return type_convert<T>(powf(type_convert<float>(x), type_convert<float>(gamma)));
};
template <>
CK_TILE_DEVICE float pow<float>(float x, float gamma)
{
return powf(x, gamma);
};
template <>
CK_TILE_DEVICE double pow<double>(double x, double gamma)
{
return pow(x, gamma);
};
template <typename T>
CK_TILE_DEVICE T expm1(T x)
{
return type_convert<T>(expm1f(type_convert<float>(x)));
};
template <>
CK_TILE_DEVICE float expm1<float>(float x)
{
return expm1f(x);
};
template <>
CK_TILE_DEVICE double expm1<double>(double x)
{
return expm1(x);
};
} // namespace ck_tile
......@@ -10,6 +10,7 @@
#include "ck_tile/core/numeric/half.hpp"
#include "ck_tile/core/numeric/bfloat16.hpp"
#include "ck_tile/core/numeric/float8.hpp"
#include "ck_tile/core/numeric/int8.hpp"
namespace ck_tile {
......@@ -60,6 +61,9 @@ CK_TILE_TYPE_CONVERT(bf16_t, bf16, float, float)
CK_TILE_TYPE_CONVERT(fp8_t, fp8, float, float)
CK_TILE_TYPE_CONVERT(bf8_t, bf8, float, float)
CK_TILE_TYPE_CONVERT(float, float, int8_t, int8)
CK_TILE_TYPE_CONVERT(int8_t, int8, float, float)
#undef CK_TILE_TYPE_CONVERT
#endif
......
......@@ -91,8 +91,10 @@ struct buffer_view<address_space_enum::generic,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto
get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
CK_TILE_DEVICE constexpr auto get(index_t i,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {}) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -107,11 +109,11 @@ struct buffer_view<address_space_enum::generic,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
__builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
#endif
}
else
......@@ -134,17 +136,17 @@ struct buffer_view<address_space_enum::generic,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
if constexpr(Op == memory_operation_enum::set)
{
this->template set<X>(i, is_valid_element, x);
this->template set<X>(i, linear_offset, is_valid_element, x);
}
// FIXME: remove memory_operation_enum::add
else if constexpr(Op == memory_operation_enum::add)
{
auto tmp = this->template get<X>(i, is_valid_element);
this->template set<X>(i, is_valid_element, x + tmp);
auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
}
}
......@@ -154,7 +156,7 @@ struct buffer_view<address_space_enum::generic,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -169,9 +171,9 @@ struct buffer_view<address_space_enum::generic,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
__builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
*c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
#endif
}
}
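// Hedged note on the new (i, linear_offset) parameters used throughout this
// file: both are element offsets of T and the effective element address is
// p_data_[i + linear_offset], so passing 0 for linear_offset reproduces the
// previous single-offset behavior, e.g. (buffer view `buf` is illustrative):
//   buf.template set<float>(i, 0 /*linear_offset*/, is_valid, value);
//   auto v = buf.template get<float>(i, 0 /*linear_offset*/, is_valid);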
......@@ -276,8 +278,10 @@ struct buffer_view<address_space_enum::global,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto
get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
CK_TILE_DEVICE constexpr auto get(index_t i,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {}) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -303,7 +307,7 @@ struct buffer_view<address_space_enum::global,
t_per_x,
Coherence,
oob_conditional_check>(
p_data_, i, is_valid_element, buffer_size_);
p_data_, i + linear_offset, is_valid_element, buffer_size_);
}
else
{
......@@ -311,8 +315,11 @@ struct buffer_view<address_space_enum::global,
remove_cvref_t<T>,
t_per_x,
Coherence,
oob_conditional_check>(
p_data_, i, is_valid_element, buffer_size_, invalid_element_value_);
oob_conditional_check>(p_data_,
i + linear_offset,
is_valid_element,
buffer_size_,
invalid_element_value_);
}
}
else
......@@ -322,11 +329,11 @@ struct buffer_view<address_space_enum::global,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
__builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
return tmp;
#else
return *c_style_pointer_cast<const X*>(&p_data_[i]);
return *c_style_pointer_cast<const X*>(&p_data_[i + linear_offset]);
#endif
}
else
......@@ -352,7 +359,8 @@ struct buffer_view<address_space_enum::global,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
index_t i,
index_t v_offset,
index_t i_offset,
bool is_valid_element,
bool_constant<pre_nop> = {}) const
{
......@@ -366,7 +374,38 @@ struct buffer_view<address_space_enum::global,
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_load_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check, pre_nop>(
dst, cached_buf_res_, i, is_valid_element, bool_constant<pre_nop>{});
dst, cached_buf_res_, v_offset, i_offset, is_valid_element, bool_constant<pre_nop>{});
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto async_get(CK_TILE_LDS_ADDR remove_cvref_t<T>* smem,
index_t i,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {}) const
{
// X is vector of T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_async_buffer_load_with_oob<remove_cvref_t<T>, t_per_x, Coherence>(
smem,
cached_buf_res_,
i,
linear_offset,
is_valid_element,
bool_constant<oob_conditional_check>{});
}
// i is offset of T, not X. i should be aligned to X
......@@ -378,6 +417,7 @@ struct buffer_view<address_space_enum::global,
bool>::type = false>
CK_TILE_DEVICE constexpr auto async_get_raw(remove_cvref_t<T>* smem,
index_t i,
index_t linear_offset,
bool /*is_valid_element*/,
bool_constant<pre_nop> = {}) const
{
......@@ -391,40 +431,80 @@ struct buffer_view<address_space_enum::global,
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_async_buffer_load_with_oob_raw<remove_cvref_t<T>, t_per_x, Coherence>(
smem, cached_buf_res_, i, bool_constant<pre_nop>{});
smem, cached_buf_res_, i, linear_offset, bool_constant<pre_nop>{});
}
// i is offset of T, not X. i should be aligned to X
template <memory_operation_enum Op,
typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void update(index_t i,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {})
{
if constexpr(Op == memory_operation_enum::set)
{
this->template set<X>(i, is_valid_element, x);
this->template set<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
}
else if constexpr(Op == memory_operation_enum::atomic_add)
{
this->template atomic_add<X>(i, is_valid_element, x);
this->template atomic_add<X, oob_conditional_check>(
i, linear_offset, is_valid_element, x);
}
else if constexpr(Op == memory_operation_enum::atomic_max)
{
this->template atomic_max<X>(i, is_valid_element, x);
this->template atomic_max<X, oob_conditional_check>(
i, linear_offset, is_valid_element, x);
}
// FIXME: remove memory_operation_enum::add
else if constexpr(Op == memory_operation_enum::add)
{
auto tmp = this->template get<X>(i, is_valid_element);
this->template set<X>(i, is_valid_element, x + tmp);
auto tmp =
this->template get<X, oob_conditional_check>(i, linear_offset, is_valid_element);
this->template set<X, oob_conditional_check>(
i, linear_offset, is_valid_element, x + tmp);
// tmp += x;
// this->template set<X>(i, is_valid_element, tmp);
}
}
// i is offset of T, not X. i should be aligned to X
template <memory_operation_enum Op,
typename X,
bool oob_conditional_check = true,
bool pre_nop = false,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void update_raw(index_t i,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
if constexpr(Op == memory_operation_enum::set)
{
this->template set_raw<X, oob_conditional_check>(i, linear_offset, is_valid_element, x);
}
else if constexpr(Op == memory_operation_enum::atomic_add)
{
this->template atomic_add_raw<X, oob_conditional_check, pre_nop>(
i, linear_offset, is_valid_element, x);
}
else if constexpr(Op == memory_operation_enum::atomic_max)
{
// this->template atomic_max_raw<X>(i, linear_offset, is_valid_element, x);
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
bool oob_conditional_check = true,
......@@ -432,7 +512,7 @@ struct buffer_view<address_space_enum::global,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -453,7 +533,7 @@ struct buffer_view<address_space_enum::global,
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x, Coherence>(
x, p_data_, i, is_valid_element, buffer_size_);
x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
}
else
{
......@@ -462,9 +542,9 @@ struct buffer_view<address_space_enum::global,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
__builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
*c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
#endif
}
}
......@@ -477,7 +557,7 @@ struct buffer_view<address_space_enum::global,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void set_raw(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void set_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -489,15 +569,17 @@ struct buffer_view<address_space_enum::global,
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store_raw<remove_cvref_t<T>, t_per_x, Coherence, oob_conditional_check>(
x, p_data_, i, is_valid_element, buffer_size_);
x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void atomic_add(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void
atomic_add(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
......@@ -532,23 +614,57 @@ struct buffer_view<address_space_enum::global,
if constexpr(use_amd_buffer_addressing)
{
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, buffer_size_);
x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
}
else
{
if(is_valid_element)
{
atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
}
}
}
template <typename X,
bool oob_conditional_check = true,
bool pre_nop = true,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void atomic_max(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void
atomic_add_raw(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// using scalar_t = typename vector_traits<remove_cvref_t<T>>::scalar_type;
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = vector_traits<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X should contain multiple T");
static_assert(get_address_space() == address_space_enum::global, "only support global mem");
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_add_raw<remove_cvref_t<T>,
t_per_x,
Coherence,
oob_conditional_check,
pre_nop>(
x, p_data_, i, linear_offset, is_valid_element, buffer_size_);
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void
atomic_max(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -572,11 +688,11 @@ struct buffer_view<address_space_enum::global,
if constexpr(use_amd_buffer_addressing)
{
amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, buffer_size_);
x, p_data_, i + linear_offset, is_valid_element, buffer_size_);
}
else if(is_valid_element)
{
atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i + linear_offset], x);
}
}
......@@ -668,8 +784,10 @@ struct buffer_view<address_space_enum::lds,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto
get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
CK_TILE_DEVICE constexpr auto get(index_t i,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {}) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -684,14 +802,14 @@ struct buffer_view<address_space_enum::lds,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
__builtin_memcpy(&tmp, &(p_data_[i + linear_offset]), sizeof(X));
return tmp;
#else
using buf_t = ext_vector_t<typename vector_traits<remove_cvref_t<T>>::scalar_type,
scalar_per_t_vector * scalar_per_x_vector>;
// using buf_t = ushort __attribute__((ext_vector_type(8)));
auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i]);
auto rtn = *c_style_pointer_cast<const buf_t*>(&p_data_[i + linear_offset]);
return bit_cast<X>(rtn);
#endif
}
......@@ -708,6 +826,23 @@ struct buffer_view<address_space_enum::lds,
}
}
// i is offset of T, not X. i should be aligned to X
template <typename X,
bool oob_conditional_check = true,
bool pre_nop = false,
typename std::enable_if<
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto get_raw(remove_cvref_t<X>& dst,
index_t v_offset,
index_t i_offset,
bool /*is_valid_element*/,
bool_constant<pre_nop> = {}) const
{
smem_load<sizeof(X)>{}(dst, v_offset * sizeof(T), i_offset * sizeof(T));
}
// i is offset of T, not X. i should be aligned to X
template <memory_operation_enum Op,
typename X,
......@@ -715,17 +850,17 @@ struct buffer_view<address_space_enum::lds,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
if constexpr(Op == memory_operation_enum::set)
{
this->template set<X>(i, is_valid_element, x);
this->template set<X>(i, linear_offset, is_valid_element, x);
}
// FIXME: remove memory_operation_enum::add
else if constexpr(Op == memory_operation_enum::add)
{
auto tmp = this->template get<X>(i, is_valid_element);
this->template set<X>(i, is_valid_element, x + tmp);
auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
}
}
......@@ -735,7 +870,7 @@ struct buffer_view<address_space_enum::lds,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -751,6 +886,7 @@ struct buffer_view<address_space_enum::lds,
bool constexpr workaround_int8_ds_write_issue = false;
#endif
i += linear_offset; // for simplicity, fold linear_offset into i
if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
int8_t>::value &&
workaround_int8_ds_write_issue)
......@@ -952,8 +1088,10 @@ struct buffer_view<address_space_enum::vgpr,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE constexpr auto
get(index_t i, bool is_valid_element, bool_constant<oob_conditional_check> = {}) const
CK_TILE_DEVICE constexpr auto get(index_t i,
index_t /*linear_offset*/,
bool is_valid_element,
bool_constant<oob_conditional_check> = {}) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -995,17 +1133,17 @@ struct buffer_view<address_space_enum::vgpr,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void update(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void update(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
if constexpr(Op == memory_operation_enum::set)
{
this->template set<X>(i, is_valid_element, x);
this->template set<X>(i, linear_offset, is_valid_element, x);
}
// FIXME: remove memory_operation_enum::add
else if constexpr(Op == memory_operation_enum::add)
{
auto tmp = this->template get<X>(i, is_valid_element);
this->template set<X>(i, is_valid_element, x + tmp);
auto tmp = this->template get<X>(i, linear_offset, is_valid_element);
this->template set<X>(i, linear_offset, is_valid_element, x + tmp);
}
}
......@@ -1015,7 +1153,7 @@ struct buffer_view<address_space_enum::vgpr,
std::is_same<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<T>>::scalar_type>::value,
bool>::type = false>
CK_TILE_DEVICE void set(index_t i, bool is_valid_element, const X& x)
CK_TILE_DEVICE void set(index_t i, index_t linear_offset, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = vector_traits<remove_cvref_t<T>>::vector_size;
......@@ -1030,9 +1168,9 @@ struct buffer_view<address_space_enum::vgpr,
#if CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
__builtin_memcpy(&(p_data_[i + linear_offset]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
*c_style_pointer_cast<X*>(&p_data_[i + linear_offset]) = x;
#endif
}
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -12,6 +12,7 @@
#include "ck_tile/core/tensor/tile_window.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#include "ck_tile/core/tensor/tile_window.hpp"
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/tensor/null_tile_window.hpp"
#include "ck_tile/core/tensor/null_tensor.hpp"
......@@ -21,21 +22,85 @@ template <typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
index_t NumCoord,
index_t i_access = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile(const tile_window_with_static_distribution<BottomTensorView_,
WindowLengths_,
TileDistribution_,
NumCoord>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {})
{
return tile_window.load(bool_constant<oob_conditional_check>{});
return tile_window.load(number<i_access>{}, bool_constant<oob_conditional_check>{});
}
template <typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
index_t i_access = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile(const tile_window_linear<BottomTensorView_,
WindowLengths_,
TileDistribution_,
LinearBottomDims_>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {})
{
return tile_window.load(number<i_access>{}, bool_constant<oob_conditional_check>{});
}
template <typename DistributedTensor_,
typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
index_t NumCoord,
index_t i_access = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
const tile_window_with_static_distribution<BottomTensorView_,
WindowLengths_,
TileDistribution_,
NumCoord>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {})
{
return tile_window.load(dst_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
}
template <typename DistributedTensor_,
typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
index_t i_access = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto load_tile(DistributedTensor_& dst_tile,
const tile_window_linear<BottomTensorView_,
WindowLengths_,
TileDistribution_,
LinearBottomDims_>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {})
{
return tile_window.load(dst_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
}
/**
* @brief Loads a tile of data using inline assembly.
*
 * @note Bear in mind that when loading data this way, you have to manually initialize your
 * thread buffer and synchronize the load afterwards to make sure it has finished before
 * using the loaded data from registers
* @see `tile_window_with_static_distribution::init_raw()` and `buffer_view.hpp`
* @see `buffer_load_fence()`
*/
template <typename T,
typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
index_t NumCoord,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw(T& tile,
......@@ -43,10 +108,33 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
WindowLengths_,
TileDistribution_,
NumCoord>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
tile_window.load_raw(
tile, number<i_access>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
}
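#if 0
// usage sketch (hypothetical names; clear_tile() is assumed to be a helper that
// zero-initializes the thread buffer): raw loads bypass the normal return-value
// path, so the destination must be initialized up front and the loads fenced
// before the registers are consumed.
auto window = make_tile_window_raw(tensor_view, window_lengths, origin, dstr); // calls init_raw()
auto dst    = make_static_distributed_tensor<DataType>(dstr);
clear_tile(dst);            // initialize the thread buffer
load_tile_raw(dst, window); // issues the raw (inline-asm) buffer loads
buffer_load_fence();        // wait for the loads before reading dst
#endif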
template <typename T,
typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto load_tile_raw(T& tile,
const tile_window_linear<BottomTensorView_,
WindowLengths_,
TileDistribution_,
LinearBottomDims_>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
tile_window.load_raw(tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
tile_window.load_raw(
tile, number<i_access>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
}
template <typename LdsTileWindow_,
......@@ -54,6 +142,7 @@ template <typename LdsTileWindow_,
typename WindowLengths_,
typename TileDistribution_,
index_t NumCoord,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto
......@@ -62,11 +151,37 @@ async_load_tile_raw(LdsTileWindow_&& lds_tile,
WindowLengths_,
TileDistribution_,
NumCoord>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
return tile_window.async_load_raw(
lds_tile, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
return tile_window.async_load_raw(lds_tile,
number<i_access>{},
bool_constant<oob_conditional_check>{},
bool_constant<pre_nop>{});
}
template <typename LdsTileWindow_,
typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto async_load_tile_raw(LdsTileWindow_&& lds_tile,
const tile_window_linear<BottomTensorView_,
WindowLengths_,
TileDistribution_,
LinearBottomDims_>& tile_window,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
return tile_window.async_load_raw(lds_tile,
number<i_access>{},
bool_constant<oob_conditional_check>{},
bool_constant<pre_nop>{});
}
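#if 0
// usage sketch (hypothetical names): async loads write directly into LDS through an
// LDS tile window; fence them, and (as assumed here) barrier with block_sync_lds(),
// before any thread reads the LDS data.
async_load_tile_raw(lds_window, gmem_window);
async_load_fence();   // wait until no async loads are outstanding
block_sync_lds();     // assumption: LDS barrier helper
#endif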
CK_TILE_DEVICE auto async_load_fence(index_t cnt = 0)
......
......@@ -80,6 +80,13 @@ CK_TILE_DEVICE constexpr auto make_tile_window(null_tensor_view,
return null_tile_window<remove_cvref_t<WindowLengths>>{window_lengths};
}
template <typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto make_tile_window(const null_tile_window<WindowLengths>& t,
const StaticTileDistribution&)
{
return t;
}
template <typename WindowLengths>
CK_TILE_DEVICE void
move_tile_window(null_tile_window<WindowLengths>&,
......
......@@ -109,7 +109,7 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
// get input vectors
static_for<0, num_vec_in, 1>{}([&](auto i) {
constexpr auto idx_y_in = generate_array(
constexpr auto idx_y_in = generate_tuple(
[&](auto ii) {
return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
},
......@@ -170,7 +170,7 @@ CK_TILE_DEVICE void shuffle_tile(OutTensor& out, const InTensor& in)
}
else
{
// NOT implemented
static_assert(false, "The shuffle should always happen!");
}
}
......
......@@ -187,4 +187,44 @@ set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_ten
});
}
// this function is used inside the span sweep (loop-over) helpers
template <typename YLengths, index_t XUnpacks>
CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number<XUnpacks>)
{
constexpr auto y_size = reduce_on_sequence(YLengths{}, multiplies{}, number<1>{});
constexpr auto y_packs = number<XUnpacks>{};
static_assert(y_size % y_packs == 0);
constexpr auto y_slice_size = y_size / y_packs;
constexpr auto slice_info = slice_sequence(YLengths{}, number<y_slice_size>{});
constexpr auto unpacks = slice_info[number<1>{}];
return unpacks;
}
namespace detail {
// check whether 2 static_distributed_tensor have the same data type and number of elements
// and differ only in distribution
template <typename X, typename Y>
struct is_similiar_distributed_tensor
{
static constexpr bool value = false;
};
template <typename TypeX, typename DistX, typename TypeY, typename DistY>
struct is_similiar_distributed_tensor<static_distributed_tensor<TypeX, DistX>,
static_distributed_tensor<TypeY, DistY>>
{
using Tx = static_distributed_tensor<TypeX, DistX>;
using Ty = static_distributed_tensor<TypeY, DistY>;
static constexpr bool value = std::is_same_v<typename Tx::DataType, typename Ty::DataType> &&
Tx::get_thread_buffer_size() == Ty::get_thread_buffer_size();
};
template <typename X, typename Y>
inline constexpr bool is_similiar_distributed_tensor_v =
is_similiar_distributed_tensor<X, Y>::value;
} // namespace detail
} // namespace ck_tile
......@@ -10,6 +10,7 @@
#include "ck_tile/core/container/container_helper.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/tensor/tile_window.hpp"
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
......@@ -72,7 +73,7 @@ store_tile(tile_window_with_static_distribution<BottomTensorView_,
NumCoord>& tile_window,
const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
{
tile_window.store(dstr_tensor);
tile_window.store(dstr_tensor, number<-1>{});
}
template <typename BottomTensorView_,
......@@ -87,7 +88,33 @@ store_tile_raw(tile_window_with_static_distribution<BottomTensorView_,
NumCoord>& tile_window,
const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
{
tile_window.store_raw(dstr_tensor);
tile_window.store_raw(dstr_tensor, number<-1>{});
}
template <typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
typename DataType_>
CK_TILE_DEVICE void store_tile(
tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
tile_window,
const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
{
tile_window.store(dstr_tensor, number<-1>{});
}
template <typename BottomTensorView_,
typename WindowLengths_,
typename TileDistribution_,
typename LinearBottomDims_,
typename DataType_>
CK_TILE_DEVICE void store_tile_raw(
tile_window_linear<BottomTensorView_, WindowLengths_, TileDistribution_, LinearBottomDims_>&
tile_window,
const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
{
tile_window.store_raw(dstr_tensor, number<-1>{});
}
} // namespace ck_tile
......@@ -8,6 +8,7 @@
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
......@@ -27,4 +28,281 @@ CK_TILE_DEVICE void sweep_tile_span(TileDistributedSpan_, const F& f)
});
}
// unpacked span: this version supports sweeping a span with an unpacked (multi-arg) functor
//
template <
typename TileDistributedSpan_, // tile_distributed_span<...>
typename F, // signature: F(tile_distributed_index<...>)
typename Unpacks = typename uniform_sequence_gen<TileDistributedSpan_::Impl::size(), 1>::type>
CK_TILE_DEVICE void sweep_tile_uspan(TileDistributedSpan_, const F& f, Unpacks = {})
{
using DstrSpan = remove_cvref_t<TileDistributedSpan_>;
static_uford<typename DstrSpan::Impl, Unpacks>{}(
[&](auto... dstr_idx_impl) { f(detail::make_tile_distributed_index(dstr_idx_impl)...); });
}
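#if 0
// usage sketch (hypothetical names): nested sweep where the inner span is swept
// 2 pixels per functor call via unpacks = sequence<1, 2>, assuming that span has
// 2 y-components and the last one is divisible by 2.
constexpr auto spans = decltype(acc_tile)::get_distributed_spans();
sweep_tile_span(spans[number<0>{}], [&](auto idx0) {
    sweep_tile_uspan(
        spans[number<1>{}],
        [&](auto idx1_a, auto idx1_b) {
            acc_tile(make_tuple(idx0, idx1_a)) = acc_tile(make_tuple(idx0, idx1_a)) * scale;
            acc_tile(make_tuple(idx0, idx1_b)) = acc_tile(make_tuple(idx0, idx1_b)) * scale;
        },
        sequence<1, 2>{});
});
#endif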
namespace impl {
template <typename, typename, typename>
struct sweep_tile_impl;
template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>>
{
CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
return y_unpacks;
}
CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto u =
static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
return u.get_num_of_access() *
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
.get_num_of_access();
}
template <typename F, typename SpanIdx>
CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
sweep_tile_uspan(
spans[number<I>{}],
[&](auto... i_idx) {
const auto next_span_idx = embed_tuples(
[&](auto si) { return make_tuple(concat_tuple(si, make_tuple(i_idx))...); },
span_idx);
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
f, next_span_idx);
},
get_y_unpacks());
}
template <typename F, typename SpanIdx, index_t i_access>
CK_TILE_HOST_DEVICE constexpr void
operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto u =
static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
constexpr auto access_stride =
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
.get_num_of_access();
constexpr auto curr_i_access = number<i_access / access_stride>{};
constexpr auto next_i_access = number<i_access % access_stride>{};
u(
[&](auto... i_idx) {
const auto next_span_idx = embed_tuples(
[&](auto si) {
return make_tuple(concat_tuple(
si, make_tuple(detail::make_tile_distributed_index(i_idx)))...);
},
span_idx);
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
f, next_span_idx, next_i_access);
},
curr_i_access);
}
};
template <typename DistributedTensor, typename UnpacksPerXDim>
struct sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<>>
{
CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const { return 1; }
template <typename F, typename SpanIdx>
CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, const SpanIdx& span_idx) const
{
unpack(f, span_idx);
}
template <typename F, typename SpanIdx, index_t i_access>
CK_TILE_HOST_DEVICE constexpr void
operator()(const F& f, const SpanIdx& span_idx, number<i_access>) const
{
unpack(f, span_idx);
}
};
template <typename, typename, typename>
struct sweep_tile_impl_0;
// TODO: support empty tuple to remove this "entry-point" like function
template <typename DistributedTensor, typename UnpacksPerXDim, index_t I, index_t... Is>
struct sweep_tile_impl_0<DistributedTensor, UnpacksPerXDim, sequence<I, Is...>>
{
CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks() const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto y_lengths = typename decltype(spans[number<I>{}])::Impl{};
constexpr auto x_unpacks = number<UnpacksPerXDim{}.at(number<I>{})>{};
constexpr auto y_unpacks = get_y_unpacks_from_x_unpacks(y_lengths, x_unpacks);
return y_unpacks;
}
CK_TILE_HOST_DEVICE constexpr index_t get_num_of_access() const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto u =
static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
return u.get_num_of_access() *
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
.get_num_of_access();
}
template <typename F>
CK_TILE_HOST_DEVICE constexpr void operator()(const F& f) const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
sweep_tile_uspan(
spans[number<I>{}],
[&](auto... i_idx) {
constexpr auto next_span_idx = make_tuple(make_tuple(i_idx)...);
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
f, next_span_idx);
},
get_y_unpacks());
}
template <typename F, index_t i_access>
CK_TILE_HOST_DEVICE constexpr void operator()(const F& f, number<i_access>) const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto u =
static_uford<typename decltype(spans[number<I>{}])::Impl, decltype(get_y_unpacks())>{};
constexpr auto access_stride =
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}
.get_num_of_access();
constexpr auto curr_i_access = number<i_access / access_stride>{};
constexpr auto next_i_access = number<i_access % access_stride>{};
u(
[&](auto... i_idx) {
constexpr auto next_span_idx =
make_tuple(make_tuple(detail::make_tile_distributed_index(i_idx))...);
sweep_tile_impl<DistributedTensor, UnpacksPerXDim, sequence<Is...>>{}(
f, next_span_idx, next_i_access);
},
curr_i_access);
}
};
} // namespace impl
/*
 * Enhanced sweep-tile utility that can control the number of unpacks along each X-dim.
 * The lambda arguments are distributed indices, which can be directly
 * plugged into the distributed tensor as setter/getter.
 *
 * e.g. in the calls below, y is of type DistributedTensor and r is a row scale
*
* // sweep tile 1 by 1
* sweep_tile<DistributedTensor>([&](auto idx) {
* constexpr auto row_id = make_tuple(idx[number<0>{}]);
* y(idx) = y(idx) * r(row_id);
* });
*
* // sweep tile with 2 pixel from last dim each function call
* sweep_tile<DistributedTensor>(
* [&](auto idx_0, auto idx_1) {
* constexpr auto row_id = make_tuple(idx_0[number<0>{}]);
* y(idx_0) = y(idx_0) * r(row_id);
* y(idx_1) = y(idx_1) * r(row_id);
* },
* sequence<1, 2>{});
*
* // sweep tile with 2x2 pixel each function call
* sweep_tile<DistributedTensor>(
* [&](auto idx_00, auto idx_01, auto idx_10, auto idx_11) {
* constexpr auto row_id0 = make_tuple(idx_00[number<0>{}]);
* constexpr auto row_id1 = make_tuple(idx_10[number<0>{}]);
* y(idx_00) = y(idx_00) * r(row_id0);
* y(idx_01) = y(idx_01) * r(row_id0);
* y(idx_10) = y(idx_10) * r(row_id1);
* y(idx_11) = y(idx_11) * r(row_id1);
* },
* sequence<2, 2>{});
*
* TODO: do we need constexpr? lambda function could be non-constexpr
*/
template <typename DistributedTensor,
typename F,
typename UnpacksPerXDim =
typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void sweep_tile(const F& f, UnpacksPerXDim = {})
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
impl::sweep_tile_impl_0<DistributedTensor,
UnpacksPerXDim,
typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(f);
}
template <typename DistributedTensor,
typename F,
typename UnpacksPerXDim =
typename uniform_sequence_gen<DistributedTensor::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE constexpr void
sweep_tile(const DistributedTensor&, const F& f, UnpacksPerXDim = {})
{
sweep_tile<DistributedTensor, F, UnpacksPerXDim>(f, UnpacksPerXDim{});
}
/*
 * construct a sweep-tile instance, which supports issuing the lambda one access at a time.
 * Note that this struct holds the lambda functor, but does not hold the distributed tensor;
 * the functionality is otherwise the same as sweep_tile()
*/
template <typename DistributedTensor_,
typename F_,
typename UnpacksPerXDim_ =
typename uniform_sequence_gen<DistributedTensor_::get_num_of_dimension(), 1>::type>
struct tile_sweeper
{
using DistributedTensor = remove_cvref_t<DistributedTensor_>;
using F = remove_cvref_t<F_>;
using UnpacksPerXDim = remove_cvref_t<UnpacksPerXDim_>;
CK_TILE_HOST_DEVICE tile_sweeper(const F& f_, UnpacksPerXDim = {}) : f(f_) {}
CK_TILE_HOST_DEVICE tile_sweeper(const DistributedTensor&, const F& f_, UnpacksPerXDim = {})
: f(f_)
{
}
CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_access()
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
constexpr auto tmp =
impl::sweep_tile_impl_0<DistributedTensor,
UnpacksPerXDim,
typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{};
return tmp.get_num_of_access();
}
CK_TILE_HOST_DEVICE void operator()() const
{
sweep_tile<DistributedTensor>(f, UnpacksPerXDim{});
}
template <index_t i_access>
CK_TILE_HOST_DEVICE void operator()(number<i_access>) const
{
constexpr auto spans = DistributedTensor::get_distributed_spans();
impl::sweep_tile_impl_0<DistributedTensor,
UnpacksPerXDim,
typename arithmetic_sequence_gen<0, spans.size(), 1>::type>{}(
f, number<i_access>{});
}
F f;
};
// partial deduction is not allowed
// template <typename T, typename F, typename U>
// CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
// deduction guide
template <typename T,
typename F,
typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const T&, const F&, U = {})->tile_sweeper<T, F, U>;
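#if 0
// usage sketch (hypothetical names y_tile / scale): issue the sweep one access at a
// time, e.g. to interleave it with other work.
auto ts = tile_sweeper{y_tile, [&](auto idx) { y_tile(idx) = y_tile(idx) * scale; }};
static_for<0, decltype(ts)::get_num_of_access(), 1>{}([&](auto i_issue) {
    ts(i_issue); // issue only the i_issue-th access
});
#endif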
} // namespace ck_tile
......@@ -16,6 +16,24 @@
namespace ck_tile {
/*
* tensor_view
 * abstracts the underlying memory buffer (global, LDS, etc...)
 * and provides unified get/set functions for access.
 *
 * For addressing into the buffer we use 2 variables:
 * coord         : ND tensor coordinate, used to compute the actual offset
 * linear_offset : 1D offset, placed in the immediate field of the buffer
 *                 instruction to help reduce register usage
 *
 * Users can use either of the two fields, or both, to index into the tensor.
 *
 * We usually provide 2 sets of APIs for buffer get/set, e.g.
 * get_vectorized_elements()/get_vectorized_elements_raw();
 * the former usually calls an intrinsic or a normal C function, the latter
 * usually calls an inline-asm function.
*
*/
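#if 0
// usage sketch (hypothetical names view / coord / vec_t): the element address is
// coord + linear_offset; a compile-time-known part of the offset can be passed as
// linear_offset so it can be folded into the instruction's immediate field.
auto v = view.template get_vectorized_elements<vec_t>(coord, /*linear_offset=*/0);
view.template set_vectorized_elements<vec_t>(coord, /*linear_offset=*/8, v);
#endif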
template <typename BufferView_,
typename TensorDesc_,
memory_operation_enum DstInMemOp_ = memory_operation_enum::set>
......@@ -49,22 +67,6 @@ struct tensor_view
CK_TILE_HOST_DEVICE constexpr auto& get_buffer_view() { return buf_; }
#if 0
CK_TILE_HOST_DEVICE constexpr DataType get_element(const TensorCoord& coord) const
{
return buf_.template get<DataType>(
coord.get_offset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord));
}
CK_TILE_HOST_DEVICE constexpr void set_element(const TensorCoord& coord, const DataType& x)
{
buf_.template set<DataType>(
coord.get_offset(),
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
#endif
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
......@@ -75,14 +77,34 @@ struct tensor_view
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
get_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
bool_constant<oob_conditional_check> = {}) const
{
return buf_.template get<X>(
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
bool_constant<oob_conditional_check>{});
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr remove_cvref_t<X>
get_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element, // flag
bool_constant<oob_conditional_check> = {}) const
{
return buf_.template get<X>(coord.get_offset(),
linear_offset,
is_valid_element,
bool_constant<oob_conditional_check>{});
}
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
......@@ -94,27 +116,109 @@ struct tensor_view
bool>::type = false>
CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst,
const TensorCoord& coord,
index_t linear_offset,
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
dst,
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
bool_constant<pre_nop>{});
}
template <typename X,
bool oob_conditional_check = true,
bool pre_nop = false,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE void get_vectorized_elements_raw(remove_cvref_t<X>& dst,
const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem,
const TensorCoord& coord,
index_t linear_offset) const
{
return buf_.template async_get<X>(
smem,
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
bool_constant<oob_conditional_check>{});
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
async_get_vectorized_elements(CK_TILE_LDS_ADDR remove_cvref_t<DataType>* smem,
const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element) const
{
return buf_.template async_get<X>(smem,
coord.get_offset(),
linear_offset,
is_valid_element,
bool_constant<oob_conditional_check>{});
}
template <typename X,
bool pre_nop = false,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void async_get_vectorized_elements_raw(
remove_cvref_t<DataType>* smem, const TensorCoord& coord, bool_constant<pre_nop> = {}) const
CK_TILE_HOST_DEVICE constexpr void
async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem,
const TensorCoord& coord,
index_t linear_offset,
bool_constant<pre_nop> = {}) const
{
return buf_.template async_get_raw<X>(
smem, coord.get_offset(), true /*not used*/, bool_constant<pre_nop>{});
smem,
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
bool_constant<pre_nop>{});
}
template <typename X,
bool pre_nop = false,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
async_get_vectorized_elements_raw(remove_cvref_t<DataType>* smem,
const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
bool_constant<pre_nop> = {}) const
{
return buf_.template async_get_raw<X>(
smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
}
// X is vector of DataType.
......@@ -125,11 +229,15 @@ struct tensor_view
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements(
const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
CK_TILE_HOST_DEVICE constexpr void
set_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template set<X, oob_conditional_check>(
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
......@@ -140,15 +248,53 @@ struct tensor_view
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void set_vectorized_elements_raw(
const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
CK_TILE_HOST_DEVICE constexpr void
set_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template set<X, oob_conditional_check>(
coord.get_offset(), linear_offset, is_valid_element, x);
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
set_vectorized_elements_raw(const TensorCoord& coord,
index_t linear_offset,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template set_raw<X, oob_conditional_check>(
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
set_vectorized_elements_raw(const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template set_raw<X, oob_conditional_check>(
coord.get_offset(), linear_offset, is_valid_element, x);
}
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
......@@ -157,15 +303,78 @@ struct tensor_view
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void update_vectorized_elements(
const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
CK_TILE_HOST_DEVICE constexpr void
update_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template update<DstInMemOp, X, oob_conditional_check>(
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
template <typename X,
bool oob_conditional_check = true,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
update_vectorized_elements(const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {})
{
buf_.template update<DstInMemOp, X, oob_conditional_check>(
coord.get_offset(), linear_offset, is_valid_element, x);
}
// X is vector of DataType.
// "coord" is coordinate of DataType, not X. "coord" should be aligned to X
template <typename X,
bool oob_conditional_check = true,
bool pre_nop = false,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
update_vectorized_elements_raw(const TensorCoord& coord,
index_t linear_offset,
const X& x,
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
coord.get_offset(),
linear_offset,
coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
x);
}
template <typename X,
bool oob_conditional_check = true,
bool pre_nop = false,
typename std::enable_if<
std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void
update_vectorized_elements_raw(const TensorCoord& coord,
index_t linear_offset,
bool is_valid_element,
const X& x,
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {})
{
buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
coord.get_offset(), linear_offset, is_valid_element, x);
}
CK_TILE_HOST_DEVICE void print() const
{
printf("tensor_view{");
......
......@@ -17,6 +17,14 @@
namespace ck_tile {
namespace detail {
template <typename Distribution>
CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
{
return Distribution::_get_partition_index();
}
} // namespace detail
// distributed span
template <index_t... PartialHsLengths>
struct tile_distributed_span
......@@ -83,6 +91,21 @@ struct tile_distribution
CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_p() { return NDimP; }
CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_r() { return NDimR; }
CK_TILE_HOST_DEVICE static auto _get_partition_index()
{
// only support warp-tile and block-tile
static_assert(NDimP == 1 or NDimP == 2, "wrong!");
if constexpr(NDimP == 1)
{
return array<index_t, 1>{get_lane_id()};
}
else if constexpr(NDimP == 2)
{
return array<index_t, 2>{get_warp_id(), get_lane_id()};
}
}
CK_TILE_HOST_DEVICE static constexpr auto get_lengths()
{
#if 0
......@@ -149,6 +172,16 @@ struct tile_distribution
}
#endif
template <typename PartitionIndex = decltype(_get_partition_index())>
CK_TILE_HOST_DEVICE auto
calculate_index(const PartitionIndex& ps_idx = _get_partition_index()) const
{
const auto ps_ys_idx = container_concat(ps_idx, array<index_t, NDimY>{0});
const auto window_adaptor_thread_coord_tmp =
make_tensor_adaptor_coordinate(ps_ys_to_xs_, ps_ys_idx);
return window_adaptor_thread_coord_tmp.get_bottom_index();
}
CK_TILE_HOST_DEVICE static constexpr auto get_distributed_spans()
{
constexpr auto distributed_spans_impl = DstrEncode::detail::distributed_spans_lengthss_;
......@@ -421,6 +454,7 @@ struct tile_distribution_detail
} // namespace detail
#if 0
// this returns a constexpr tile_distribution
template <typename StaticTileDistributionEncoding_>
CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_)
......@@ -457,6 +491,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistribution
detail::tile_distribution_detail<remove_cvref_t<decltype(rh_major_minor_to_hidden_ids)>>>{
ps_ys_to_xs_adaptor, ys_to_d_descriptor};
}
#endif
// this returns a static tile_distribution
template <typename StaticTileDistributionEncoding_>
......@@ -499,129 +534,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistr
//***********************************************************************************
namespace detail {
template <typename Distribution>
CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
{
// only support warp-tile and block-tile
static_assert(Distribution::NDimP == 1 or Distribution::NDimP == 2, "wrong!");
if constexpr(Distribution::NDimP == 1)
{
return array<index_t, 1>{get_lane_id()};
}
else if constexpr(Distribution::NDimP == 2)
{
return array<index_t, 2>{get_warp_id(), get_lane_id()};
}
}
template <typename, typename, typename, index_t>
struct reverse_slice_sequence_impl;
template <index_t x,
index_t... xs,
index_t m,
index_t... ms,
index_t id,
index_t... ids,
index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x, xs...>,
sequence<m, ms...>,
sequence<id, ids...>,
SliceSize>
{
using old_scan =
reverse_slice_sequence_impl<sequence<xs...>, sequence<ms...>, sequence<ids...>, SliceSize>;
static constexpr auto slice_size = old_scan::remaining_slice_sizes::front().value;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths =
typename sequence_merge<sequence<slice_length>, typename old_scan::dim_lengths>::type;
using dim_slices =
typename sequence_merge<sequence<x / slice_length>, typename old_scan::dim_slices>::type;
using remaining_slice_sizes = typename sequence_merge<
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>,
typename old_scan::remaining_slice_sizes>::type;
// the first idx at which the sliced length is not equal to the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t _split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t _split_idx =
std::conditional_t<_split_flag, number<id>, number<0>>::value;
static constexpr index_t split_flag = _split_flag || old_scan::split_flag;
static constexpr index_t split_idx = std::
conditional_t<old_scan::split_flag, number<old_scan::split_idx>, number<_split_idx>>::value;
};
template <index_t x, index_t m, index_t id, index_t SliceSize>
struct reverse_slice_sequence_impl<sequence<x>, sequence<m>, sequence<id>, SliceSize>
{
static constexpr auto slice_size = SliceSize;
static constexpr auto slice_length =
std::conditional_t<m, number<gcd(x, slice_size)>, number<x>>::value;
using dim_lengths = sequence<slice_length>;
using dim_slices = sequence<x / slice_length>;
using remaining_slice_sizes =
std::conditional_t<m, sequence<slice_size / slice_length>, sequence<slice_size>>;
// the first idx at which the sliced length is not equal to the original length
static constexpr index_t _flag =
slice_length != x && remaining_slice_sizes{}.front().value == 1;
static constexpr index_t split_flag = std::conditional_t<m, number<_flag>, number<0>>::value;
static constexpr index_t split_idx =
std::conditional_t<split_flag, number<id>, number<0>>::value;
};
// clang-format off
// input: a sequence (with an optional mask) and SliceSize, the size per slice
// output: the per-slice lengths of the sequence and the number of slices
//
// e.g. <2, 1, 4, 2>, 8 -> lengths:<1, 1, 4, 2> , nums: <2, 1, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 4, 1, 2>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 1> : 16 slices , slice_idx: 2
// <4, 2, 4, 1, 6>, 4 -> lengths:<1, 1, 2, 1, 2> , nums: <4, 2, 2, 1, 3> : 48 slices , slice_idx: 2
// <4, 2, 5, 1, 2>, 10 -> lengths:<1, 1, 5, 1, 2> , nums: <4, 2, 1, 1, 1> : 8 slices , slice_idx: 1
//
// <4, 2, 8>, 64 -> lengths:<4, 2, 8> , nums: <1, 1, 1> : 1 slices , slice_idx: 0
// <4, 2, 8>, 32 -> lengths:<2, 2, 8> , nums: <2, 1, 1> : 2 slices , slice_idx: 0
// <4, 2, 8>, 16 -> lengths:<1, 2, 8> , nums: <4, 1, 1> : 4 slices , slice_idx: 0
// <4, 2, 8>, 8 -> lengths:<1, 1, 8> , nums: <4, 2, 1> : 8 slices , slice_idx: 1
// <4, 2, 8>, 4 -> lengths:<1, 1, 4> , nums: <4, 2, 2> : 16 slices , slice_idx: 2
// <4, 2, 8>, 2 -> lengths:<1, 1, 2> , nums: <4, 2, 4> : 32 slices , slice_idx: 2
// <4, 2, 8>, 1 -> lengths:<1, 1, 1> , nums: <4, 2, 8> : 64 slices , slice_idx: 2
//
// <4, 2, 1, 4, 2> / 4 ->
// mask:<1, 1, 1, 0, 1>, -> lengths:<1, 2, 1, 4, 2> , nums: <4, 1, 1, 1, 1> : 8 slices , slice_idx: 0
//
// return tuple<slice_lengths, slice_nums, slice_index>, where slice_index is the index at which
// split slices start (right -> left),
// i.e. the first index whose sliced length differs from the original length
// clang-format on
template <typename Seq,
index_t SliceSize,
typename Mask = typename uniform_sequence_gen<Seq::size(), 1>::type>
constexpr auto reverse_slice_sequence(Seq,
number<SliceSize>,
Mask = typename uniform_sequence_gen<Seq::size(), 1>::type{})
{
static_assert(Seq::size() == Mask::size());
using sliced_type =
reverse_slice_sequence_impl<Seq,
Mask,
typename arithmetic_sequence_gen<0, Seq::size(), 1>::type,
SliceSize>;
static_assert(sliced_type::remaining_slice_sizes::front().value == 1,
"can not evenly divide this sequence, please check");
return make_tuple(typename sliced_type::dim_lengths{},
typename sliced_type::dim_slices{},
number<sliced_type::split_idx>{});
}
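#if 0
// illustration, using the <4, 2, 8> / 8 row of the table above:
constexpr auto sliced = reverse_slice_sequence(sequence<4, 2, 8>{}, number<8>{});
static_assert(std::is_same_v<remove_cvref_t<decltype(sliced[number<0>{}])>, sequence<1, 1, 8>>);
static_assert(std::is_same_v<remove_cvref_t<decltype(sliced[number<1>{}])>, sequence<4, 2, 1>>);
static_assert(sliced[number<2>{}].value == 1);
#endif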
//
// slice a tensor along x_dim, which results in a split along y_dim, not p_dim.
// We don't support slicing across p_dim (aka, slicing across different threads)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -18,6 +18,8 @@
namespace ck_tile {
// Note: this tile window does not support single issue;
// you need to use the tile_window_linear structure for that purpose
template <typename BottomTensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
......@@ -41,6 +43,7 @@ struct tile_window_with_static_distribution
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static_assert(NumCoord == 1);
// TODO: check WindowLengths and StaticTileDistribution are consistent
......@@ -189,7 +192,8 @@ struct tile_window_with_static_distribution
constexpr auto idx_diff_ys =
SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -222,10 +226,11 @@ struct tile_window_with_static_distribution
// move thread's window adaptor coordinate and bottom tensor coordinate
// [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
template <typename ATopIndex>
CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
WindowAdaptorCoord& window_adaptor_thread_coord,
BottomTensorCoord& bottom_tensor_thread_coord,
const AdaptorTopIndex& idx_diff_adaptor_top) const
const ATopIndex& idx_diff_adaptor_top) const
{
array<index_t, NDimBottomTensor> idx_diff_adaptor_bottom;
......@@ -279,20 +284,31 @@ struct tile_window_with_static_distribution
get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims));
}
CK_TILE_DEVICE constexpr auto get_num_access() const { return load_store_traits::NumAccess; }
CK_TILE_DEVICE constexpr auto get_num_of_access() const { return load_store_traits::NumAccess; }
template <bool oob_conditional_check = true>
CK_TILE_DEVICE auto load(bool_constant<oob_conditional_check> = {}) const
template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load(number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {}) const
{
using Traits = load_store_traits;
constexpr auto tile_dstr = TileDstr{};
auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
load(dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
return dst_tensor;
}
template <typename DistributedTensor,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {}) const
{
using Traits = load_store_traits;
using vector_t = typename Traits::vector_t;
using SFC_Ys = typename Traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
// loop over thread tensor space [y0, y1, ...]
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
/// TODO: use structure binding (to be captured later) if compiled in C++20
......@@ -308,11 +324,11 @@ struct tile_window_with_static_distribution
// read from bottom tensor
const vector_t vec_value =
get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
bottom_tensor_thread_coord, bool_constant<oob_conditional_check>{});
bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
#if 1
// write into distributed tensor
static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_array(
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
: idx_ys_start[jj];
......@@ -338,20 +354,23 @@ struct tile_window_with_static_distribution
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
}
});
});
return dst_tensor;
}
template <typename DstTile, bool oob_conditional_check = true, bool pre_nop = false>
template <typename DstTile,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
......@@ -397,6 +416,7 @@ struct tile_window_with_static_distribution
get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
dst_vec_tbuf.template at<d / Traits::ScalarPerVector>(),
bottom_tensor_thread_coord,
0 /**/,
bool_constant<oob_conditional_check>{},
pre_nop_);
#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
......@@ -409,23 +429,24 @@ struct tile_window_with_static_distribution
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
}
});
});
#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
asm volatile("; this inline asm is workaround to prevent compiler from using too much "
"scratch memory" ::);
#endif
}
// TODO: currently async load only implemented in inline asm
template <typename LdsTileWindow_, bool oob_conditional_check = true, bool pre_nop = false>
template <typename LdsTileWindow_,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
......@@ -467,7 +488,7 @@ struct tile_window_with_static_distribution
// loop over thread tensor space [y0, y1, ...]
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
// TODO: use structure binding (to be captured later) if compiled in C++20
/// TODO: use structure binding (to be captured later) if compiled in C++20
auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1];
......@@ -482,15 +503,16 @@ struct tile_window_with_static_distribution
// read from bottom tensor
get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
smem, bottom_tensor_thread_coord, pre_nop_);
smem, bottom_tensor_thread_coord, 0, pre_nop_);
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -501,8 +523,81 @@ struct tile_window_with_static_distribution
});
}
template <bool oob_conditional_check = true>
template <typename LdsTileWindow_,
index_t i_access_unsupport_ = -1,
bool oob_conditional_check = true>
CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {}) const
{
using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
using LdsDataType = typename LdsTileWindow::DataType;
// issues * warps * lanes
static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
// TODO: an LDS offset is not good for an intrinsic-based implementation (the compiler can't
// figure out the dependency), hence avoid an offset-based solution. size_per_buf should be zero
// (how to check?)
constexpr index_t size_per_buf =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<0>{}, number<0>{}));
constexpr index_t size_per_wave =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<1>{}, number<0>{})) -
size_per_buf;
constexpr index_t size_per_issue =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<1>{}, number<0>{}, number<0>{})) -
size_per_buf;
const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
using Traits = load_store_traits;
using vector_t = typename Traits::vector_t;
using SFC_Ys = typename Traits::SFC_Ys;
// TODO: we force CK_TILE_LDS_ADDR
CK_TILE_LDS_ADDR LdsDataType* smem =
lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value;
// loop over thread tensor space [y0, y1, ...]
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
/// TODO: use structure binding (to be captured later) if compiled in C++20
auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1];
static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
// read from bottom tensor
get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
smem, bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
smem += size_per_issue; // Note we manually increase the per-issue offset
}
});
});
}
template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {}) const
{
using Traits = load_store_traits;
......@@ -515,7 +610,6 @@ struct tile_window_with_static_distribution
// loop over thread tensor space [y0, y1, ...]
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
/// TODO: use structure binding (to be captured later) if compiled in C++20
auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1];
......@@ -530,7 +624,7 @@ struct tile_window_with_static_distribution
vector_t vec_value;
static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_array(
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
: idx_ys_start[jj];
......@@ -548,15 +642,19 @@ struct tile_window_with_static_distribution
// write into bottom tensor
get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
bottom_tensor_thread_coord,
0,
vec_value,
bool_constant<oob_conditional_check>{});
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -565,8 +663,9 @@ struct tile_window_with_static_distribution
});
}
CK_TILE_DEVICE void
store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor) const
template <index_t i_access_unsupport_ = -1>
CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access_unsupport_> = {}) const
{
using Traits = load_store_traits;
......@@ -591,7 +690,7 @@ struct tile_window_with_static_distribution
// read from distributed tensor
vector_t vec_value;
static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_array(
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
: idx_ys_start[jj];
......@@ -606,15 +705,16 @@ struct tile_window_with_static_distribution
// write into bottom tensor
get_bottom_tensor_view()
.template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
bottom_tensor_thread_coord, vec_value);
bottom_tensor_thread_coord, 0, vec_value);
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -623,8 +723,9 @@ struct tile_window_with_static_distribution
});
}
template <bool oob_conditional_check = true>
template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {}) const
{
using Traits = load_store_traits;
......@@ -650,7 +751,7 @@ struct tile_window_with_static_distribution
vector_t vec_value;
static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_array(
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
: idx_ys_start[jj];
......@@ -666,15 +767,86 @@ struct tile_window_with_static_distribution
// write into bottom tensor
get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
bottom_tensor_thread_coord,
0,
vec_value,
bool_constant<oob_conditional_check>{});
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys =
container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
}
});
});
}
template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true, bool pre_nop>
CK_TILE_DEVICE void update_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access_unsupport_> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
using Traits = load_store_traits;
using vector_t = typename Traits::vector_t;
using SFC_Ys = typename Traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
// loop over thread tensor space [y0, y1, ...]
static_for<0, NumCoord, 1>{}([&](auto iCoord) {
/// TODO: use structure binding (to be captured later) if compiled in C++20
auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
auto bottom_tensor_thread_coord = pre_computed_coords_[iCoord][I1];
static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
// read from distributed tensor
vector_t vec_value;
static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
: idx_ys_start[jj];
},
number<NDimY>{});
constexpr index_t d =
tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
vec_value.template get_as<DataType>()(j) =
dstr_tensor.get_thread_buffer().template at<d>();
});
// write into bottom tensor
get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
bottom_tensor_thread_coord,
0,
vec_value,
bool_constant<oob_conditional_check>{},
bool_constant<pre_nop>{});
// move thread coordinate
if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -746,7 +918,8 @@ struct tile_window_with_static_distribution
constexpr auto idx_diff_ys =
SFC_Ys::get_step_between(number<0>{}, number<iCoord * NumAccessPerCoord>{});
constexpr auto idx_diff_ps_ys = container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}), idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
......@@ -798,6 +971,27 @@ make_tile_window(const TensorView_& tensor_view,
tensor_view, window_lengths, origin, tile_distribution};
}
// this version can't be called in a constexpr context
template <typename TensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
index_t NumCoord = 1>
CK_TILE_DEVICE auto
make_tile_window_raw(const TensorView_& tensor_view,
const WindowLengths_& window_lengths,
const multi_index<TensorView_::get_num_of_dimension()>& origin,
const StaticTileDistribution_& tile_distribution,
number<NumCoord> = {})
{
auto w = tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
remove_cvref_t<WindowLengths_>,
remove_cvref_t<StaticTileDistribution_>,
NumCoord>{
tensor_view, window_lengths, origin, tile_distribution};
w.init_raw();
return w;
}
template <typename TensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
......@@ -922,6 +1116,19 @@ make_tile_window(const tile_window_with_static_lengths<TensorView, WindowLengths
tile_distribution);
}
template <typename TensorView, typename WindowLengths, typename StaticTileDistribution>
CK_TILE_DEVICE constexpr auto
make_tile_window_raw(const tile_window_with_static_lengths<TensorView, WindowLengths>& tile_window,
const StaticTileDistribution& tile_distribution)
{
auto w = make_tile_window(tile_window.get_bottom_tensor_view(),
tile_window.get_window_lengths(),
tile_window.get_window_origin(),
tile_distribution);
w.init_raw();
return w;
}
template <typename TensorView_, typename WindowLengths_>
CK_TILE_DEVICE void move_tile_window(
tile_window_with_static_lengths<TensorView_, WindowLengths_>& window,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/utility.hpp"
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/container/array.hpp"
#include "ck_tile/core/container/sequence.hpp"
#include "ck_tile/core/container/tuple.hpp"
#include "ck_tile/core/container/container_helper.hpp"
#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
#include "ck_tile/core/tensor/tensor_adaptor.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
namespace ck_tile {
#define WINDOW_DISPATCH_ISSUE() \
if constexpr(i_access < 0) \
{ \
static_for<0, NumAccess, 1>{}([&](auto ia) { issue(ia); }); \
} \
else \
{ \
static_assert(i_access < NumAccess); \
issue(number<i_access>{}); \
}
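// usage note (illustrative): the tile window member functions below take an `i_access`
// template argument; WINDOW_DISPATCH_ISSUE() then either loops over all NumAccess accesses
// (i_access < 0, the default) or issues only the single access number<i_access>{},
// e.g. `win.load(number<2>{})` would issue only the 3rd access of the space-filling curve.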
//
// This version of the tile window will pre-cache offsets/flags as needed
//
// LinearBottomDims_, e.g. seq<0, 1> for a 2d tensor, marks the last dim as the linear dim,
// so that dim can be indexed with an immediate offset, which saves registers.
// TODO: when using this struct, prefer load_raw()/store_raw(), which can control
// the immediate offset on the fly
// note: the space-filling curve is non-snaked here!
//
template <typename BottomTensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
typename LinearBottomDims_>
struct tile_window_linear
{
using BottomTensorView = remove_reference_t<BottomTensorView_>;
using WindowLengths = remove_cvref_t<WindowLengths_>;
using TileDstr = remove_cvref_t<StaticTileDistribution_>;
using WindowAdaptor = typename TileDstr::PsYs2XsAdaptor;
using BottomTensorDesc = typename BottomTensorView::TensorDesc;
using DataType = remove_cvref_t<typename BottomTensorView::DataType>;
using LinearBottomDims = remove_cvref_t<LinearBottomDims_>;
static_assert(LinearBottomDims::size() == BottomTensorView::get_num_of_dimension());
static constexpr index_t NDimWindowAdaptorTop = WindowAdaptor::get_num_of_top_dimension();
static constexpr index_t NDimBottomTensor = BottomTensorDesc::get_num_of_dimension();
static constexpr index_t NDimP = TileDstr::get_num_of_dimension_p();
static constexpr index_t NDimY = TileDstr::get_num_of_dimension_y();
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
// TODO: check WindowLengths and StaticTileDistribution are consistent
static_assert(ck_tile::is_known_at_compile_time<WindowLengths>::value,
"wrong! lengths should be static");
static_assert(TileDstr::is_static(), "wrong!");
static_assert(NDimBottomTensor == WindowAdaptor::get_num_of_bottom_dimension(),
"wrong! inconsistent # of diemsnions");
using AdaptorTopIndex = array<index_t, NDimWindowAdaptorTop>;
using BottomTensorIndex = array<index_t, NDimBottomTensor>;
using WindowAdaptorCoord =
decltype(make_tensor_adaptor_coordinate(WindowAdaptor{}, AdaptorTopIndex{}));
using BottomTensorCoord =
decltype(make_tensor_coordinate(BottomTensorDesc{}, BottomTensorIndex{}));
struct traits
{
private:
// return vector dimension among [y0, y1, ...]
CK_TILE_DEVICE static constexpr auto get_window_adaptor_ys_safe_vector_length_strides()
{
// bottom tensor top dimension vector lengths and strides
const auto [bottom_tensor_top_dim_vector_lengths,
bottom_tensor_top_dim_vector_strides] =
BottomTensorDesc::get_top_dimension_safe_vector_length_strides();
// window vector lengths/strides
const auto window_adaptor_bottom_dim_vector_lengths =
bottom_tensor_top_dim_vector_lengths;
const auto window_adaptor_bottom_dim_vector_strides =
bottom_tensor_top_dim_vector_strides;
// window adaptor [p0, p1, ..., y0, y1, ...]
array<index_t, WindowAdaptor::get_num_of_hidden_dimension()>
window_adaptor_vector_lengths{-1};
array<index_t, WindowAdaptor::get_num_of_hidden_dimension()>
window_adaptor_vector_strides{-1};
constexpr auto window_adaptor_bottom_dims =
WindowAdaptor::get_bottom_dimension_hidden_ids();
set_container_subset(window_adaptor_vector_lengths,
window_adaptor_bottom_dims,
window_adaptor_bottom_dim_vector_lengths);
set_container_subset(window_adaptor_vector_strides,
window_adaptor_bottom_dims,
window_adaptor_bottom_dim_vector_strides);
const auto [window_adaptor_ps_ys_vector_lengths, window_adaptor_ps_ys_vector_strides] =
WindowAdaptor{}.get_top_dimension_safe_vector_length_strides(
window_adaptor_vector_lengths, window_adaptor_vector_strides);
// [y0, y1, ...]
constexpr auto y_dims =
typename arithmetic_sequence_gen<TileDstr::get_num_of_dimension_p(),
NDimWindowAdaptorTop,
1>::type{};
return make_tuple(get_container_subset(window_adaptor_ps_ys_vector_lengths, y_dims),
get_container_subset(window_adaptor_ps_ys_vector_strides, y_dims));
}
static constexpr auto get_vector_dim_y_scalar_per_vector()
{
const auto [ys_vector_lengths, ys_vector_strides] =
get_window_adaptor_ys_safe_vector_length_strides();
index_t VectorDimY_ = 0;
index_t ScalarPerVector_ = 1;
for(index_t i = 0; i < NDimY; ++i)
{
if(ys_vector_strides[i] == 1 && ys_vector_lengths[i] > ScalarPerVector_)
{
ScalarPerVector_ = ys_vector_lengths[i];
VectorDimY_ = i;
}
}
return make_tuple(VectorDimY_, ScalarPerVector_);
}
public:
static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
static constexpr index_t ScalarPerVector =
get_vector_dim_y_scalar_per_vector().template at<1>();
using vector_t = thread_buffer<DataType, ScalarPerVector>;
private:
static constexpr auto scalars_per_access_ = [] {
constexpr auto scalars_per_access_arr = generate_array(
[&](auto i) { return (i == VectorDimY) ? ScalarPerVector : 1; }, number<NDimY>{});
/// TODO: add non-automatic storage argument support to macro TO_SEQUENCE()
constexpr auto NDimY_ = NDimY;
return TO_SEQUENCE(scalars_per_access_arr, NDimY_);
}();
static constexpr auto get_space_filling_curve()
{
constexpr auto thread_tensor_lengths_ys =
to_sequence(TileDstr{}.get_ys_to_d_descriptor().get_lengths());
// FIXME: need logic to judge dim access order
using DimAccessOrder = typename arithmetic_sequence_gen<0, NDimY, 1>::type;
return space_filling_curve<decltype(thread_tensor_lengths_ys),
DimAccessOrder,
decltype(scalars_per_access_),
false /*!!! no snaked curve! */>{};
}
public:
using SFC_Ys = decltype(get_space_filling_curve());
static constexpr index_t NumAccess = SFC_Ys::get_num_of_access();
static_assert(0 < NumAccess, "Wrong! NumAccess should be larger than 0");
private:
static constexpr auto get_num_non_linear_access()
{
constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
using ys_to_rhs_major =
typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
constexpr auto non_linear = [&]() {
index_t cnt = 1;
static_for<0, NDimY, 1>{}([&](auto i_dim_y) {
constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
if constexpr(LinearBottomDims{}[target_h_dim] == 0)
{
cnt *= sfc_access_lens[i_dim_y];
}
});
return cnt;
}();
return non_linear;
}
// example:
// non_linear_access_map: sequence<0, 0, 0, 0, 1, 1, 1, 1> for 8 accesses, 2 registers
// used in total
// -> histogram : sequence<4, 4>
// -> prefix sum: sequence<0, 4, 8>
// non_linear_access_map: sequence<0, 1, 2, 3, 4, 5, 6, 7> for 8 accesses, 8 registers
// used in total, will pre-cache 8
// -> histogram : sequence<1, 1, 1, 1, 1, 1, 1, 1>
// -> prefix sum: sequence<0, 1, 2, 3, 4, 5, 6, 7, 8>
// non_linear_access_map: sequence<0, 0, 1, 1, 2, 2, 3, 3> for 8 accesses, 4 registers
// used in total, will pre-cache 4
// -> histogram : sequence<2, 2, 2, 2>
// -> prefix sum: sequence<0, 2, 4, 6, 8>
static constexpr auto get_non_linear_access_map()
{
constexpr auto sfc_access_lens = SFC_Ys::access_lengths;
using ys_to_rhs_major =
typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
constexpr auto non_linear_map = [&]() {
array<index_t, NumAccess> m_{0};
index_t cumulative_len_ = 1;
index_t cumulative_non_linear_len_ = 1;
static_for<0, NDimY, 1>{}([&](auto i_y) {
constexpr auto i_dim_y = number<NDimY - i_y - 1>{}; // from right to left
constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
constexpr auto is_linear_dim = LinearBottomDims{}[target_h_dim];
array<index_t, NumAccess> current_m_{0};
constexpr auto current_len_ = sfc_access_lens[i_dim_y];
// copy cumulative length as current pattern
for(auto i_ = 0; i_ < cumulative_len_; i_++)
{
current_m_(i_) = m_[i_];
}
for(auto j_ = 0; j_ < current_len_; j_++)
{
auto j_offset_ = is_linear_dim ? 0 : j_ * cumulative_non_linear_len_;
for(auto i_ = 0; i_ < cumulative_len_; i_++)
{
m_(j_ * cumulative_len_ + i_) = current_m_[i_] + j_offset_;
}
}
cumulative_len_ *= current_len_;
if(!is_linear_dim)
cumulative_non_linear_len_ *= current_len_;
});
return m_;
}();
return TO_SEQUENCE(non_linear_map, NumAccess);
}
static constexpr auto get_non_linear_access_histogram()
{
constexpr auto m_ = get_non_linear_access_map();
// m_.foo();
constexpr auto r_ =
typename arithmetic_sequence_gen<0, get_num_non_linear_access() + 1, 1>::type{};
constexpr auto h_ = histogram_sorted_sequence(m_, r_);
return h_;
}
static constexpr auto get_non_linear_access_histogram_prefix_sum()
{
constexpr auto h_ = get_non_linear_access_histogram();
constexpr auto h_prefix_sum_ = prefix_sum_sequence(h_);
return h_prefix_sum_;
}
public:
static constexpr index_t NumAccess_NonLinear = get_num_non_linear_access();
using AccessMap_NonLinear = decltype(get_non_linear_access_map()); // sequence
using AccessHistogram_NonLinear = decltype(get_non_linear_access_histogram());
using AccessPrefixSum_NonLinear = decltype(get_non_linear_access_histogram_prefix_sum());
};
static constexpr index_t NumAccess = traits::NumAccess;
static constexpr index_t NumAccess_NonLinear = traits::NumAccess_NonLinear;
using AccessMap_NonLinear = typename traits::AccessMap_NonLinear;
using AccessHistogram_NonLinear = typename traits::AccessHistogram_NonLinear;
using AccessPrefixSum_NonLinear = typename traits::AccessPrefixSum_NonLinear;
CK_TILE_DEVICE constexpr tile_window_linear() = default;
CK_TILE_DEVICE constexpr tile_window_linear(const BottomTensorView& bottom_tensor_view,
const WindowLengths& window_lengths,
const BottomTensorIndex& window_origin,
const TileDstr& tile_distribution)
: bottom_tensor_view_{bottom_tensor_view},
window_lengths_{window_lengths},
window_origin_{window_origin},
tile_dstr_{tile_distribution},
cached_coords_{},
cached_flags_{}
{
auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
tile_distribution.get_ps_ys_to_xs_adaptor(),
container_concat(make_tuple(get_warp_id(), get_lane_id()),
generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
// pre-cache coordinates and flags for future load/store() calls (might allocate more registers)
using SFC_Ys = typename traits::SFC_Ys;
static_for<0, NumAccess, 1>{}([&](auto i_access) {
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
constexpr auto need_save_non_linear_coord =
bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
if constexpr(need_save_non_linear_coord)
{
cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
}
// TODO: need pad_tensor_view to decide which dims require a flag check
// the cached flag is independent from the non-linear coord,
// but it needs to be updated in move_tile, with the proper dims
cached_flags_(i_access) = coordinate_has_valid_offset_assuming_top_index_is_valid(
bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_coord_tmp);
if constexpr(i_access != (NumAccess - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord_tmp,
bottom_tensor_thread_coord_tmp,
idx_diff_ps_ys);
}
});
}
CK_TILE_DEVICE static constexpr index_t get_num_of_dimension() { return NDimBottomTensor; }
CK_TILE_DEVICE static constexpr bool has_static_tile_distribution()
{
return TileDstr::is_static();
}
CK_TILE_DEVICE constexpr auto get_window_lengths() const { return window_lengths_; }
CK_TILE_DEVICE constexpr auto get_tile_distribution() const { return tile_dstr_; }
CK_TILE_DEVICE constexpr auto get_bottom_tensor_view() const { return bottom_tensor_view_; }
CK_TILE_DEVICE constexpr auto get_window_origin() const { return window_origin_; }
CK_TILE_DEVICE constexpr void
set_bottom_tensor_view_data_ptr(typename BottomTensorView::DataType* data)
{
bottom_tensor_view_.buf_.p_data_ = data;
}
// move thread's window adaptor coordinate and bottom tensor coordinate
// [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...] ==> [x0', x1', ...] ==> [offset]
template <typename ATopIndex>
CK_TILE_DEVICE void move_window_adaptor_and_bottom_tensor_thread_coordinate(
WindowAdaptorCoord& window_adaptor_thread_coord,
BottomTensorCoord& bottom_tensor_thread_coord,
const ATopIndex& idx_diff_adaptor_top) const
{
array<index_t, NDimBottomTensor> idx_diff_adaptor_bottom;
move_tensor_adaptor_coordinate(tile_dstr_.get_ps_ys_to_xs_adaptor(),
window_adaptor_thread_coord,
idx_diff_adaptor_top,
idx_diff_adaptor_bottom);
move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
bottom_tensor_thread_coord,
idx_diff_adaptor_bottom);
}
template <index_t i_access>
CK_TILE_DEVICE static constexpr auto get_bottom_linear_coordinate(number<i_access>)
{
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto idx_ys = SFC_Ys::get_index(number<i_access>{});
using ys_to_rhs_major =
typename decltype(TileDstr{}.get_static_tile_distribution_encoding())::Ys2RHsMajor;
constexpr auto modified_idx_ys = generate_tuple(
[&](auto i_dim_y) {
constexpr auto rhs_major = ys_to_rhs_major{}[i_dim_y];
constexpr auto target_h_dim = number<rhs_major - 1>{}; // no r dim here!
if constexpr(LinearBottomDims{}[target_h_dim] == 0)
{
return number<0>{};
}
else
{
return number<idx_ys[i_dim_y]>{};
}
},
number<NDimY>{});
constexpr auto adaptor_ = TileDstr{}.get_ps_ys_to_xs_adaptor();
constexpr auto idx_ =
container_concat(make_tuple(number<0>{}, number<0>{}), modified_idx_ys);
return adaptor_.calculate_bottom_index(idx_);
}
template <index_t i_access>
CK_TILE_DEVICE static constexpr index_t get_bottom_linear_offset(number<i_access>)
{
constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
constexpr auto is_pure_linear_tensor =
reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
if constexpr(is_pure_linear_tensor)
{
// this case usually is an LDS window, where everything is known at compile time.
// we directly use the BottomTensorView transform to compute the offset, in case of padding
auto bottom_tensor_coord =
make_tensor_coordinate(BottomTensorView{}.get_tensor_descriptor(), linear_coord);
return bottom_tensor_coord.get_offset();
}
else
{
// this case usually is a global window, where the last dim can be linear.
// we hack here and use the original TileDstr to compute the linear offset,
// ... hoping that there is no extra padding between the other dims, which makes sense
// since padding would introduce a runtime length (so a linear offset could not be used)
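// worked example (illustrative, assumed shapes): with x_len_ = {4, 8} and x_idx_ = {1, 3},
// the row-major accumulation below yields 1 * 8 + 3 = 11 as the compile-time linear offset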
constexpr index_t linear_offset = [&]() {
constexpr auto x_idx_ = linear_coord;
constexpr auto x_len_ = TileDstr{}.get_lengths();
static_assert(x_idx_.size() == x_len_.size());
constexpr index_t x_dims_ = x_idx_.size();
index_t cu_stride_ = 1;
index_t cu_offset_ = 0;
static_for<0, x_dims_, 1>{}([&](auto i_) {
auto r_i_ = number<x_dims_ - i_ - 1>{};
cu_offset_ += x_idx_[r_i_] * cu_stride_;
cu_stride_ *= x_len_[r_i_];
});
return cu_offset_;
}();
return linear_offset;
}
}
CK_TILE_DEVICE constexpr auto get_num_of_access() const { return traits::NumAccess; }
template <index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load(number<i_access> = {}, bool_constant<oob_conditional_check> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
auto bottom_tensor_flag = cached_flags_[IAccess];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
// read from bottom tensor
const vector_t vec_value =
get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
bottom_tensor_thread_coord,
linear_offset,
bottom_tensor_flag,
bool_constant<oob_conditional_check>{});
#if 1
// data index [y0, y1, ...]
constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
// write into distributed tensor
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
dst_tensor.get_thread_buffer().template at<d>() =
vec_value.template get_as<DataType>()[j];
});
#else
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
static_assert(d % traits::ScalarPerVector == 0);
dst_tensor.get_thread_buffer().template get_as<vector_t>()(
number<d / traits::ScalarPerVector>{}) = bit_cast<vector_t>(vec_value);
#endif
};
WINDOW_DISPATCH_ISSUE();
return dst_tensor;
}
template <typename DstTile, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto load(DstTile& dst_tensor,
number<i_access> = {},
bool_constant<oob_conditional_check> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
// auto dst_tensor = make_static_distributed_tensor<DataType>(tile_dstr);
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
auto bottom_tensor_flag = cached_flags_[IAccess];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
// read from bottom tensor
const vector_t vec_value =
get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
bottom_tensor_thread_coord,
linear_offset,
bottom_tensor_flag,
bool_constant<oob_conditional_check>{});
#if 1
// data index [y0, y1, ...]
constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
// write into distributed tensor
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
dst_tensor.get_thread_buffer().template at<d>() =
vec_value.template get_as<DataType>()[j];
});
#else
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
static_assert(d % traits::ScalarPerVector == 0);
dst_tensor.get_thread_buffer().template get_as<vector_t>()(
number<d / traits::ScalarPerVector>{}) = bit_cast<vector_t>(vec_value);
#endif
};
WINDOW_DISPATCH_ISSUE();
return dst_tensor;
}
template <typename DstTile,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE void load_raw(DstTile& dst_tensor,
number<i_access> = {}, // negative means loop over all num_access
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
static constexpr index_t YElementSize =
TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
static_assert(YElementSize % traits::ScalarPerVector == 0);
using vectorized_tbuf = array<vector_t, YElementSize / traits::ScalarPerVector>;
constexpr auto tile_dstr = TileDstr{};
auto& dst_vec_tbuf = reinterpret_cast<vectorized_tbuf&>(dst_tensor.get_thread_buffer());
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto pre_nop_ = [&]() {
if constexpr(pre_nop && i_access_ == 0 &&
BottomTensorView::buffer_view::get_address_space() ==
address_space_enum::global)
return bool_constant<true>{};
else
return bool_constant<false>{};
}();
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
auto bottom_tensor_flag = cached_flags_[IAccess];
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
static_assert(d % traits::ScalarPerVector == 0);
get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
dst_vec_tbuf.template at<d / traits::ScalarPerVector>(),
bottom_tensor_thread_coord,
linear_offset /**/,
bottom_tensor_flag,
bool_constant<oob_conditional_check>{},
pre_nop_);
#if CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE || \
CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
asm volatile(""); // this started from rocm-6.2, but same symptom, so reuse this flag
#endif
};
WINDOW_DISPATCH_ISSUE();
}
// TODO: currently async load only implemented in inline asm
template <typename LdsTileWindow_,
index_t i_access = -1,
bool oob_conditional_check = true,
bool pre_nop = false>
CK_TILE_DEVICE auto async_load_raw(LdsTileWindow_&& lds_tile,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
using LdsDataType = typename LdsTileWindow::DataType;
// currently we only support the case where every dim is non-linear;
// it is actually not performant to have a linear (e.g. fast-changing) dim here
static_assert(NumAccess_NonLinear == NumAccess);
static_assert(BottomTensorView::buffer_view::get_address_space() ==
address_space_enum::global);
// issues * warps * lanes
static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
const index_t size_per_buf =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<0>{}, number<0>{})) *
sizeof(LdsDataType);
const index_t size_per_wave =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<1>{}, number<0>{})) *
sizeof(LdsDataType) -
size_per_buf;
const index_t size_per_issue =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<1>{}, number<0>{}, number<0>{})) *
sizeof(LdsDataType) -
size_per_buf;
const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
m0_set_with_memory(m0_init_value); // This should be wave independent
using vector_t = typename traits::vector_t;
LdsDataType* smem = lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_;
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto pre_nop_ = [&]() {
if constexpr(pre_nop && i_access_ == 0)
return bool_constant<true>{};
else
return bool_constant<false>{};
}();
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
auto bottom_tensor_flag = cached_flags_[IAccess]; // get this flag anyway
// read from bottom tensor
get_bottom_tensor_view().template async_get_vectorized_elements_raw<vector_t>(
smem, bottom_tensor_thread_coord, 0, bottom_tensor_flag, pre_nop_);
// move thread coordinate
if constexpr(i_access_ != (NumAccess - 1))
{
m0_inc_with_memory(size_per_issue);
}
};
WINDOW_DISPATCH_ISSUE();
}
template <typename LdsTileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
number<i_access> = {},
bool_constant<oob_conditional_check> = {}) const
{
using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
using LdsDataType = typename LdsTileWindow::DataType;
// currently we only support the case where every dim is non-linear;
// it is actually not performant to have a linear (e.g. fast-changing) dim here
static_assert(NumAccess_NonLinear == NumAccess);
static_assert(BottomTensorView::buffer_view::get_address_space() ==
address_space_enum::global);
// issues * warps * lanes
static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
// TODO: an LDS offset is not good for the intrinsic-based implementation (the compiler can't
// figure out the dependency), hence avoid the offset-based solution. size_per_buf should be
// zero (how to check?)
constexpr index_t size_per_buf =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<0>{}, number<0>{}));
constexpr index_t size_per_wave =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<1>{}, number<0>{})) -
size_per_buf;
constexpr index_t size_per_issue =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<1>{}, number<0>{}, number<0>{})) -
size_per_buf;
const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
using vector_t = typename traits::vector_t;
// TODO: we force CK_TILE_LDS_ADDR
CK_TILE_LDS_ADDR LdsDataType* smem =
lds_tile.get_bottom_tensor_view().get_buffer_view().p_data_ + m0_init_value;
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
auto bottom_tensor_flag = cached_flags_[IAccess];
// read from bottom tensor
get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
smem,
bottom_tensor_thread_coord,
0,
bottom_tensor_flag,
bool_constant<oob_conditional_check>{});
// move thread coordinate
if constexpr(i_access_ != (NumAccess - 1))
{
smem += size_per_issue; // Note we manually increase the per-issue offset
}
};
WINDOW_DISPATCH_ISSUE();
}
template <index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void store(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access> = {},
bool_constant<oob_conditional_check> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
auto bottom_tensor_flag = cached_flags_[IAccess];
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
// read from distributed tensor
vector_t vec_value;
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
vec_value.template get_as<DataType>()(j) =
dstr_tensor.get_thread_buffer().template at<d>();
});
// write into bottom tensor
get_bottom_tensor_view().template set_vectorized_elements<vector_t>(
bottom_tensor_thread_coord,
linear_offset,
bottom_tensor_flag,
vec_value,
bool_constant<oob_conditional_check>{});
};
WINDOW_DISPATCH_ISSUE();
}
template <index_t i_access = -1>
CK_TILE_DEVICE void store_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
static constexpr bool oob_conditional_check = true;
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
auto bottom_tensor_flag = cached_flags_[IAccess];
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
// read from distributed tensor
vector_t vec_value;
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
vec_value.template get_as<DataType>()(j) =
dstr_tensor.get_thread_buffer().template at<d>();
});
// write into bottom tensor
get_bottom_tensor_view()
.template set_vectorized_elements_raw<vector_t, oob_conditional_check>(
bottom_tensor_thread_coord, linear_offset, bottom_tensor_flag, vec_value);
};
WINDOW_DISPATCH_ISSUE();
}
template <index_t i_access = -1, bool oob_conditional_check = true>
CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access> = {},
bool_constant<oob_conditional_check> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
auto bottom_tensor_flag = cached_flags_[IAccess];
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
// read from distributed tensor
vector_t vec_value;
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
vec_value.template get_as<DataType>()(j) =
dstr_tensor.get_thread_buffer().template at<d>();
});
// write into bottom tensor
get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
bottom_tensor_thread_coord,
linear_offset,
bottom_tensor_flag,
vec_value,
bool_constant<oob_conditional_check>{});
};
WINDOW_DISPATCH_ISSUE();
}
template <index_t i_access = -1, bool oob_conditional_check = true, bool pre_nop = false>
CK_TILE_DEVICE void update_raw(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
number<i_access> = {},
bool_constant<oob_conditional_check> = {},
bool_constant<pre_nop> = {}) const
{
using vector_t = typename traits::vector_t;
using SFC_Ys = typename traits::SFC_Ys;
constexpr auto tile_dstr = TileDstr{};
// loop over thread tensor space [y0, y1, ...]
auto issue = [&](auto i_access_) {
constexpr auto IAccess = number<i_access_>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[IAccess]>{};
auto bottom_tensor_thread_coord = cached_coords_[non_linear_id];
constexpr auto linear_offset = get_bottom_linear_offset(IAccess);
auto bottom_tensor_flag = cached_flags_[IAccess];
// data index [y0, y1, ...]
constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
// read from distributed tensor
vector_t vec_value;
static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
constexpr auto idx_ys = generate_tuple(
[&](auto jj) {
return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
},
number<NDimY>{});
constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
vec_value.template get_as<DataType>()(j) =
dstr_tensor.get_thread_buffer().template at<d>();
});
// write into bottom tensor
get_bottom_tensor_view().template update_vectorized_elements_raw<vector_t>(
bottom_tensor_thread_coord,
linear_offset,
bottom_tensor_flag,
vec_value,
bool_constant<oob_conditional_check>{},
bool_constant<pre_nop>{});
};
WINDOW_DISPATCH_ISSUE();
}
// move thread's bottom tensor coordinate
// [x0', x1', ... ] ==> [offset]
// also move window-origin
CK_TILE_DEVICE void move(const BottomTensorIndex& step)
{
window_origin_ += step;
static_for<0, NumAccess, 1>{}([&](auto i_access) {
constexpr auto IAccess = number<i_access>{};
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
constexpr auto need_update_non_linear_coord =
bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
if constexpr(need_update_non_linear_coord)
{
move_tensor_coordinate(bottom_tensor_view_.get_tensor_descriptor(),
cached_coords_(non_linear_id),
step);
}
// move the current coord with linear_coords
auto tmp_coords = cached_coords_[non_linear_id];
constexpr auto linear_coord = get_bottom_linear_coordinate(IAccess);
move_tensor_coordinate(
bottom_tensor_view_.get_tensor_descriptor(), tmp_coords, linear_coord);
cached_flags_(IAccess) = coordinate_has_valid_offset_assuming_top_index_is_valid(
bottom_tensor_view_.get_tensor_descriptor(), tmp_coords);
});
}
CK_TILE_DEVICE void set_window_origin(const BottomTensorIndex& new_window_origin)
{
window_origin_ = new_window_origin;
auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
TileDstr{}.get_ps_ys_to_xs_adaptor(),
container_concat(make_tuple(get_warp_id(), get_lane_id()),
generate_tuple([&](auto) { return number<0>{}; }, number<NDimY>{})));
BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
auto bottom_tensor_thread_coord_tmp = make_tensor_coordinate(
bottom_tensor_view_.get_tensor_descriptor(), bottom_tensor_thread_origin_idx_tmp);
// pre-cache coordinates and flags for future load/store() calls (might allocate more registers)
using SFC_Ys = typename traits::SFC_Ys;
static_for<0, NumAccess, 1>{}([&](auto i_access) {
constexpr auto non_linear_id = number<AccessMap_NonLinear{}[i_access]>{};
constexpr auto need_save_non_linear_coord =
bool_constant<AccessPrefixSum_NonLinear{}[non_linear_id] == i_access>{};
if constexpr(need_save_non_linear_coord)
{
cached_coords_(non_linear_id) = bottom_tensor_thread_coord_tmp;
}
if constexpr(i_access != (NumAccess - 1))
{
constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(i_access); // tuple of number
constexpr auto idx_diff_ps_ys = container_concat(
generate_tuple([&](auto) { return number<0>{}; }, number<NDimP>{}),
idx_diff_ys);
move_window_adaptor_and_bottom_tensor_thread_coordinate(
window_adaptor_thread_coord_tmp,
bottom_tensor_thread_coord_tmp,
idx_diff_ps_ys);
}
});
}
CK_TILE_HOST_DEVICE void init_raw() { bottom_tensor_view_.init_raw(); }
// this is the bottom tensor view
// [x0', x1', ...] ==> [offset]
BottomTensorView bottom_tensor_view_;
//
WindowLengths window_lengths_;
// origin ([x0', x1', ...]) of window on bottom tensor
BottomTensorIndex window_origin_;
// Tile tensor distribution, which contains:
// 1. adaptor for window: [p0, p1, ..., y0, y1, ...] ==> [x0, x1, ...]
// 2. thread descriptor for thread tensor in register: [y0, y1, ...] ==> [d]
TileDstr tile_dstr_;
// this contains:
array<BottomTensorCoord, traits::NumAccess_NonLinear> cached_coords_;
array<bool, traits::NumAccess> cached_flags_;
};
#undef WINDOW_DISPATCH_ISSUE
namespace impl {
template <address_space_enum, index_t len_>
struct default_linear_bottom_dims_impl
{
using type = typename uniform_sequence_gen<len_, 0>::type;
};
template <index_t len_>
struct default_linear_bottom_dims_impl<address_space_enum::global, len_>
{
// global defaults to seq<0, 0, ..., 1>
using type = typename sequence_merge<typename uniform_sequence_gen<len_ - 1, 0>::type,
sequence<1>>::type;
};
template <index_t len_>
struct default_linear_bottom_dims_impl<address_space_enum::lds, len_>
{
// lds defaults to seq<1, 1, ..., 1>
using type = typename uniform_sequence_gen<len_, 1>::type;
};
} // namespace impl
template <typename TensorView_>
using default_linear_bottom_dims =
typename impl::default_linear_bottom_dims_impl<TensorView_::buffer_view::get_address_space(),
TensorView_::get_num_of_dimension()>::type;
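// e.g. (from the specializations above) a 3d global tensor_view yields sequence<0, 0, 1>,
// a 3d LDS tensor_view yields sequence<1, 1, 1>, and any other address space falls back
// to the all-zero sequence (no linear dim)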
// if using this API, a tile_window_linear will be created.
// this structure has the chance to use immediate values and save registers.
// LinearBottomDims_ must be passed in properly to control which dim is linear,
// so that a constexpr offset (linear_offset) is generated for that dim
// (and finally passed as the immediate offset of the buffer/lds instruction)
//
// Note: there is no internal check of which dims are OK to use a linear offset;
// users must make sure of this by themselves
//
// e.g.
// 2d global matrix: set LinearBottomDims_=seq<0, 1>, and the last dim will generate an
// immediate offset if each thread has multiple issues along that dim
//
// 2d LDS buffer: set LinearBottomDims_=seq<1, 1>, then only one vgpr is used as the offset;
// everything else just uses immediate offsets.
//
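// A minimal usage sketch (illustrative only; `a_view`, `a_dstr`, `kM`, `kK` and `i_m` are
// assumed to be a 2d global tensor_view, a static tile distribution and tile sizes/offsets
// defined elsewhere):
//
//   auto a_win  = make_tile_window_linear(a_view,
//                                         make_tuple(number<kM>{}, number<kK>{}),
//                                         {i_m, 0},
//                                         a_dstr,
//                                         sequence<0, 1>{});
//   auto a_tile = a_win.load();
//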
template <typename TensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE constexpr auto
make_tile_window_linear(const TensorView_& tensor_view,
const WindowLengths_& window_lengths,
const multi_index<TensorView_::get_num_of_dimension()>& origin,
const StaticTileDistribution_& tile_distribution,
LinearBottomDims_ = {})
{
static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
return tile_window_linear<remove_cvref_t<TensorView_>,
remove_cvref_t<WindowLengths_>,
remove_cvref_t<StaticTileDistribution_>,
remove_cvref_t<LinearBottomDims_>>{
tensor_view, window_lengths, origin, tile_distribution};
}
template <
typename TileWindow_,
typename StaticTileDistribution_,
typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto
make_tile_window_linear(const TileWindow_& tile_window,
const StaticTileDistribution_& tile_distribution,
LinearBottomDims_ = {})
{
return make_tile_window_linear(tile_window.get_bottom_tensor_view(),
tile_window.get_window_lengths(),
tile_window.get_window_origin(),
tile_distribution,
LinearBottomDims_{});
}
// this version must not be called in a constexpr context
template <typename TensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
typename LinearBottomDims_ = default_linear_bottom_dims<TensorView_>>
CK_TILE_DEVICE auto
make_tile_window_linear_raw(const TensorView_& tensor_view,
const WindowLengths_& window_lengths,
const multi_index<TensorView_::get_num_of_dimension()>& origin,
const StaticTileDistribution_& tile_distribution,
LinearBottomDims_ = {})
{
static_assert(LinearBottomDims_::size() == TensorView_::get_num_of_dimension());
auto w = tile_window_linear<remove_cvref_t<TensorView_>,
remove_cvref_t<WindowLengths_>,
remove_cvref_t<StaticTileDistribution_>,
remove_cvref_t<LinearBottomDims_>>{
tensor_view, window_lengths, origin, tile_distribution};
w.init_raw();
return w;
}
template <
typename TileWindow_,
typename StaticTileDistribution_,
typename LinearBottomDims_ = default_linear_bottom_dims<typename TileWindow_::BottomTensorView>>
CK_TILE_DEVICE constexpr auto
make_tile_window_linear_raw(const TileWindow_& tile_window,
const StaticTileDistribution_& tile_distribution,
LinearBottomDims_ = {})
{
return make_tile_window_linear_raw(tile_window.get_bottom_tensor_view(),
tile_window.get_window_lengths(),
tile_window.get_window_origin(),
tile_distribution,
LinearBottomDims_{});
}
template <typename TensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
typename LinearBottomDims_>
CK_TILE_DEVICE void move_tile_window(
tile_window_linear<TensorView_, WindowLengths_, StaticTileDistribution_, LinearBottomDims_>&
window,
const typename tile_window_linear<TensorView_,
WindowLengths_,
StaticTileDistribution_,
LinearBottomDims_>::BottomTensorIndex& step)
{
window.move(step);
}
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/utility.hpp"
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/container/array.hpp"
#include "ck_tile/core/container/sequence.hpp"
#include "ck_tile/core/container/tuple.hpp"
#include "ck_tile/core/container/container_helper.hpp"
#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
#include "ck_tile/core/tensor/tensor_adaptor.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#pragma once
namespace ck_tile {
// input an lds store tile and extract some information from it,
// used to set the m0 value for the gfx9 series
template <typename LdsTileWindow_>
CK_TILE_DEVICE auto get_async_store_smem_info(LdsTileWindow_&& lds_tile)
{
using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
using LdsDataType = typename LdsTileWindow::DataType;
// issues * warps * lanes
static_assert(LdsTileWindow::get_num_of_dimension() == 3); // TODO: hard coded
const index_t size_per_buf =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<0>{}, number<0>{})) *
sizeof(LdsDataType);
const index_t size_per_wave =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<0>{}, number<1>{}, number<0>{})) *
sizeof(LdsDataType) -
size_per_buf;
const index_t size_per_issue =
lds_tile.get_bottom_tensor_view().get_tensor_descriptor().calculate_offset(
make_tuple(number<1>{}, number<0>{}, number<0>{})) *
sizeof(LdsDataType) -
size_per_buf;
const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
return make_tuple(m0_init_value, size_per_issue);
}
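// usage sketch (illustrative, mirroring async_load_raw above; `lds_window` is an assumed
// 3d lds tile window):
//   auto [m0_init, size_per_issue] = get_async_store_smem_info(lds_window);
//   m0_set_with_memory(m0_init);        // once per wave, before the first issue
//   ...                                 // issue an async copy
//   m0_inc_with_memory(size_per_issue); // advance m0 between issues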
} // namespace ck_tile