Merge remote-tracking branch 'origin/develop' into aosewski/ggemm_multi_d2

129e58ae · Adam Osewski · 9bebfd42 · cb0645be · 129e58ae · 129e58ae
Commit 129e58ae authored Jun 05, 2024 by Adam Osewski
20 changed files
--- a/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
+++ b/include/ck_tile/core/arch/generic_memory_space_atomic.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include "ck_tile/core/numeric/vector_type.hpp"
+#include "ck_tile/core/numeric/type_convert.hpp"
+#include "ck_tile/core/container/thread_buffer.hpp"
+
+namespace ck_tile {
+
+// Sum two bf16 values: both operands are converted to float, added in float, and the result is
+// converted back to bf16_t.
+CK_TILE_HOST_DEVICE bf16_t add_bf16_t(const bf16_t& a, const bf16_t& b)
+{
+    const float sum = type_convert<float>(a) + type_convert<float>(b);
+    return type_convert<bf16_t>(sum);
+}
+
+// Element-wise sum of two packed bf16 pairs, built on add_bf16_t.
+CK_TILE_HOST_DEVICE bf16x2_t add_bf16x2_t(const bf16x2_t& a, const bf16x2_t& b)
+{
+    const bf16_t lane0 = add_bf16_t(a[0], b[0]);
+    const bf16_t lane1 = add_bf16_t(a[1], b[1]);
+
+    bf16x2_t sum;
+    sum[0] = lane0;
+    sum[1] = lane1;
+    return sum;
+}
+
+// Caution: DO NOT REMOVE
+// The primary template is intentionally declared but never defined, so that using atomic_add
+// with a datatype that has no explicit specialization fails to build. The purpose is to make the
+// implementation of atomic_add explicit for each datatype.
+template <typename X>
+CK_TILE_DEVICE void atomic_add(X* p_dst, const X& x);
+
+// atomic_add specialization for a packed pair of bf16 values.
+// The pair is treated as one 32-bit word and updated with a compare-and-swap retry loop:
+// read the current word, add `x` element-wise, and try to swap the sum in; repeat until the
+// word observed by atomicCAS is the one the sum was computed from.
+//   p_dst : address of the bf16x2_t to update (accessed as a uint32_t)
+//   x     : pair of bf16 values to add
+template <>
+CK_TILE_DEVICE void atomic_add<bf16x2_t>(bf16x2_t* p_dst, const bf16x2_t& x)
+{
+    // view the destination pointer as a pointer to a 32-bit word
+    union U32BF162_ADDR
+    {
+        uint32_t* u32_a;
+        bf16x2_t* bf162_a;
+    };
+
+    // view one 32-bit word as a bf16 pair and vice versa
+    union U32BF162
+    {
+        uint32_t u32;
+        bf16x2_t bf162;
+    };
+
+    U32BF162_ADDR dword_addr;
+    U32BF162 cur_v;
+    U32BF162 new_;
+    uint32_t old_v, new_v;
+    dword_addr.bf162_a = p_dst;
+    // initial (non-atomic) read of the destination word
+    cur_v.u32          = *dword_addr.u32_a;
+
+    do
+    {
+        // remember the word the sum is based on
+        old_v      = cur_v.u32;
+        new_.bf162 = add_bf16x2_t(cur_v.bf162, x);
+        new_v      = new_.u32;
+        // atomicCAS hands back the word it found at the address; if that differs from old_v
+        // the swap did not happen and the loop retries with the freshly observed word
+        cur_v.u32  = atomicCAS(dword_addr.u32_a, old_v, new_v);
+    } while(cur_v.u32 != old_v);
+}
+
+// Atomically add the N elements held in `x` to the memory starting at `p_dst`.
+//
+// Supported (T, N) combinations, enforced by the static_assert:
+//   int32_t / uint32_t : N == 1
+//   float / double     : N == 1 or N == 2
+//   bf16_t             : N == 2 or N == 4 (handled as one or two packed bf16x2_t pairs)
+//
+// Multi-element cases issue one atomic operation per element (per bf16 pair for bf16_t), so
+// the group of N elements as a whole is not updated as a single atomic transaction.
+//
+//   p_dst : address of the first destination element
+//   x     : values to add
+template <typename T, index_t N>
+CK_TILE_DEVICE void atomic_add_g(T* p_dst, const thread_buffer<T, N>& x)
+{
+    static_assert((std::is_same<T, int32_t>::value && (N == 1)) ||
+                      (std::is_same<T, uint32_t>::value && (N == 1)) ||
+                      (std::is_same<T, float>::value && (N == 1 || N == 2)) ||
+                      (std::is_same<T, double>::value && (N == 1 || N == 2)) ||
+                      (std::is_same<T, bf16_t>::value && (N == 2 || N == 4)),
+                  "wrong! not implemented");
+
+    constexpr auto I0 = number<0>{};
+    constexpr auto I1 = number<1>{};
+
+    if constexpr(std::is_same<T, float>::value)
+    {
+        if constexpr(N == 1)
+        {
+            atomicAdd(p_dst, bit_cast<float>(x));
+        }
+        else if constexpr(N == 2)
+        {
+            atomicAdd(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[I0]);
+            atomicAdd(c_style_pointer_cast<float*>(p_dst) + 1, x.template get_as<float>()[I1]);
+        }
+    }
+    else if constexpr(std::is_same<T, double>::value)
+    {
+        if constexpr(N == 1)
+        {
+            // this function returns void, so the value produced by atomicAdd is discarded
+            // (returning it here would be ill-formed once this branch is instantiated)
+            atomicAdd(p_dst, bit_cast<double>(x));
+        }
+        else if constexpr(N == 2)
+        {
+            atomicAdd(c_style_pointer_cast<double*>(p_dst), x.template get_as<double>()[I0]);
+            atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, x.template get_as<double>()[I1]);
+        }
+    }
+    else if constexpr(std::is_same<T, int32_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            atomicAdd(p_dst, bit_cast<int32_t>(x));
+        }
+    }
+    else if constexpr(std::is_same<T, uint32_t>::value)
+    {
+        if constexpr(N == 1)
+        {
+            atomicAdd(p_dst, bit_cast<uint32_t>(x));
+        }
+    }
+    else if constexpr(std::is_same<T, bf16_t>::value)
+    {
+        // bf16 data goes through the bf16x2_t specialization of atomic_add, one pair at a time
+        if constexpr(N == 2)
+        {
+            atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), bit_cast<bf16x2_t>(x));
+        }
+        else if constexpr(N == 4)
+        {
+            atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst), x.template get_as<bf16x2_t>()[I0]);
+            atomic_add(c_style_pointer_cast<bf16x2_t*>(p_dst) + 1,
+                       x.template get_as<bf16x2_t>()[I1]);
+        }
+    }
+}
+
+// Atomic max of the N elements held in `x` into the memory starting at `p_dst`.
+// Supported (T, N) combinations, enforced at compile time:
+//   int32_t / uint32_t / double : N == 1
+//   float                       : N == 1 or N == 2
+// Each element is updated by its own atomicMax call.
+template <typename T, index_t N>
+CK_TILE_DEVICE void atomic_max_g(T* p_dst, const thread_buffer<T, N>& x)
+{
+    constexpr bool is_i32 = std::is_same<T, int32_t>::value;
+    constexpr bool is_u32 = std::is_same<T, uint32_t>::value;
+    constexpr bool is_f32 = std::is_same<T, float>::value;
+    constexpr bool is_f64 = std::is_same<T, double>::value;
+
+    static_assert((is_i32 && (N == 1)) || (is_u32 && (N == 1)) ||
+                      (is_f32 && (N == 1 || N == 2)) || (is_f64 && (N == 1)),
+                  "wrong! not implemented");
+
+    if constexpr(N == 1)
+    {
+        // single element: every supported type takes the same path
+        atomicMax(p_dst, bit_cast<T>(x));
+    }
+    else if constexpr(is_f32 && N == 2)
+    {
+        // two floats: one atomicMax per element
+        atomicMax(c_style_pointer_cast<float*>(p_dst), x.template get_as<float>()[number<0>{}]);
+        atomicMax(c_style_pointer_cast<float*>(p_dst) + 1,
+                  x.template get_as<float>()[number<1>{}]);
+    }
+}
+
+} // namespace ck_tile
--- a/include/ck_tile/core/config.hpp
+++ b/include/ck_tile/core/config.hpp
@@ -3,6 +3,21 @@

 #pragma once

+// Architecture-family helper macros: each one is defined when any of the per-target macros
+// listed in its condition is defined, so later checks can test a whole family at once.
+// __gfx9__ covers gfx908, gfx90a, gfx940, gfx941 and gfx942 only (gfx900/gfx906 are not listed).
+#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
+    defined(__gfx942__)
+#define __gfx9__
+#endif
+// __gfx94__ covers gfx940, gfx941 and gfx942.
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#define __gfx94__
+#endif
+// __gfx103__ covers gfx1030, gfx1031, gfx1032, gfx1034, gfx1035 and gfx1036.
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__)
+#define __gfx103__
+#endif
+// __gfx11__ covers gfx1100, gfx1101, gfx1102 and gfx1103.
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+#define __gfx11__
+#endif
+
 #ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
@@ -109,15 +124,13 @@
 // buffer atomic add: floating point
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
-#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-    defined(__gfx942__) // for GPU code
+#elif defined(__gfx9__) // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
 #else // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif

-#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
-    defined(__gfx942__)) // for GPU code
+#if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
 #else
 #define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
@@ -137,13 +150,12 @@

 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
-#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) ||                          \
-    defined(__gfx942__) // for GPU code
+#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
+    defined(__gfx9__) // for GPU code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
-#elif defined(__gfx1030__) // for GPU code
+#elif defined(__gfx103__) // for GPU code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
-#elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code
+#elif defined(__gfx11__) // for GPU code
 #define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
 #endif

@@ -159,3 +171,7 @@
 #ifndef CK_TILE_FMHA_FWD_FAST_EXP2
 #define CK_TILE_FMHA_FWD_FAST_EXP2 0
 #endif
+
+// Defaults to 1 unless the macro is already defined before this point.
+// NOTE(review): presumably toggles a workaround for raw buffer loads of bf16 data - confirm
+// at the place where the macro is consumed.
+#ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
+#define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
+#endif
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -55,7 +55,7 @@ struct alignas(1) float8_e4m3_t
 {
    static constexpr int exponent = 4;
    static constexpr int mantissa = 3;
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    static constexpr int bias = 1 << (exponent - 1); // NANOO
 #else
    static constexpr int bias = (1 << (exponent - 1)) - 1; // IEEE
@@ -113,7 +113,7 @@ struct alignas(1) float8_e5m2_t
 {
    static constexpr int exponent = 5;
    static constexpr int mantissa = 2;
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    static constexpr int bias = 1 << (exponent - 1); // NANOO
 #else
    static constexpr int bias = (1 << (exponent - 1)) - 1; // IEEE
@@ -470,7 +470,7 @@ CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_sr_raw(float x)
 {
    constexpr int seed = 42;
    uint32_t rng       = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    float max_fp8 = 240.0f;
    x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
    union
@@ -500,7 +500,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_sr_raw(float x)
 {
    constexpr int seed = 42;
    uint32_t rng       = prand_generator_t<float, seed>{}(reinterpret_cast<uintptr_t>(&x), x);
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    union
    {
        float fval;
@@ -526,7 +526,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_sr_raw(float x)

 CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_rtn_raw(float x)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    float max_fp8 = 240.0f;
    x             = x > max_fp8 ? max_fp8 : (x < -max_fp8 ? -max_fp8 : x);
    union
@@ -554,7 +554,7 @@ CK_TILE_HOST_DEVICE fp8_raw_t float_to_fp8_rtn_raw(float x)
 }
 CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_rtn_raw(float x)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    union
    {
        float fval;
@@ -598,7 +598,7 @@ CK_TILE_HOST_DEVICE bf8_raw_t float_to_bf8_raw(float x, constant<rounding>)

 CK_TILE_HOST_DEVICE float fp8_to_float_raw(fp8_raw_t x)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    float fval;
    uint32_t i32val = static_cast<uint32_t>(x);
    fval            = __builtin_amdgcn_cvt_f32_fp8(i32val, 0);
@@ -612,7 +612,7 @@ CK_TILE_HOST_DEVICE float fp8_to_float_raw(fp8_raw_t x)

 CK_TILE_HOST_DEVICE float bf8_to_float_raw(bf8_raw_t x)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    float fval;
    uint32_t i32val = static_cast<uint32_t>(x);
    fval            = __builtin_amdgcn_cvt_f32_bf8(i32val, 0);
@@ -656,7 +656,7 @@ struct numeric_traits<fp8_t>
 {
    static constexpr int exp  = 4;
    static constexpr int mant = 3;
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    static constexpr int bias = 8;
 #else
    static constexpr int bias = 7;
@@ -668,7 +668,7 @@ struct numeric_traits<bf8_t>
 {
    static constexpr int exp  = 5;
    static constexpr int mant = 2;
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    static constexpr int bias = 16;
 #else
    static constexpr int bias = 15; // IEEE

--- a/include/ck_tile/core/numeric/half.hpp
+++ b/include/ck_tile/core/numeric/half.hpp
@@ -129,8 +129,8 @@ constexpr double fp16_to_double_hip(const fp16_hip_t& x)
 CK_TILE_HOST_DEVICE
 constexpr fp16_hip_t float_to_fp16_hip(const float& x)
 {
-    return __float2half(x);
-    // return static_cast<fp16_hip_t>(x);
+    // return __float2half(x);
+    return static_cast<fp16_hip_t>(x);
 }

 CK_TILE_HOST_DEVICE

--- a/include/ck_tile/core/numeric/integral_constant.hpp
+++ b/include/ck_tile/core/numeric/integral_constant.hpp
@@ -56,7 +56,6 @@ CK_TILE_LEFT_UNARY_OP(+)
 CK_TILE_LEFT_UNARY_OP(-)
 CK_TILE_LEFT_UNARY_OP(~)
 CK_TILE_LEFT_UNARY_OP(!)
-CK_TILE_LEFT_UNARY_OP(*)

 CK_TILE_BINARY_OP(+)
 CK_TILE_BINARY_OP(-)

--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -144,6 +144,15 @@ using int8x16_t = int8_t __attribute((ext_vector_type(16)));
 using int8x32_t = int8_t __attribute((ext_vector_type(32)));
 using int8x64_t = int8_t __attribute((ext_vector_type(64)));

+// ui8
+// using uint8_t
+using uint8x2_t  = uint8_t __attribute((ext_vector_type(2)));
+using uint8x4_t  = uint8_t __attribute((ext_vector_type(4)));
+using uint8x8_t  = uint8_t __attribute((ext_vector_type(8)));
+using uint8x16_t = uint8_t __attribute((ext_vector_type(16)));
+using uint8x32_t = uint8_t __attribute((ext_vector_type(32)));
+using uint8x64_t = uint8_t __attribute((ext_vector_type(64)));
+
 #if CK_TILE_USE_CUSTOM_DATA_TYPE
 // f8
 // using fp8_t

--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
+#include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
 #include "ck_tile/core/container/array.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
@@ -507,10 +508,10 @@ struct buffer_view<address_space_enum::global,
        bool constexpr use_amd_buffer_addressing = false;
 #endif

+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
        if constexpr(use_amd_buffer_addressing)
        {
-            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
-
            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, buffer_size_);
        }
@@ -518,7 +519,7 @@ struct buffer_view<address_space_enum::global,
        {
            if(is_valid_element)
            {
-                atomic_add<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
+                atomic_add_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
            }
        }
    }
@@ -547,16 +548,16 @@ struct buffer_view<address_space_enum::global,
        bool constexpr use_amd_buffer_addressing = false;
 #endif

+        constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
        if constexpr(use_amd_buffer_addressing)
        {
-            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
-
            amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, buffer_size_);
        }
        else if(is_valid_element)
        {
-            atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
+            atomic_max_g<remove_cvref_t<T>, t_per_x>(&p_data_[i], x);
        }
    }


--- a/include/ck_tile/core/tensor/store_tile.hpp
+++ b/include/ck_tile/core/tensor/store_tile.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once


--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -16,7 +16,9 @@

 namespace ck_tile {

-template <typename BufferView_, typename TensorDesc_>
+template <typename BufferView_,
+          typename TensorDesc_,
+          memory_operation_enum DstInMemOp_ = memory_operation_enum::set>
 struct tensor_view
 {
    using buffer_view = remove_reference_t<BufferView_>;
@@ -24,6 +26,7 @@ struct tensor_view
    using TensorDesc  = remove_cvref_t<TensorDesc_>;
    using TensorIndex = array<index_t, TensorDesc::get_num_of_top_dimension()>;
    using TensorCoord = decltype(make_tensor_coordinate(TensorDesc{}, TensorIndex{}));
+    static constexpr auto DstInMemOp = DstInMemOp_;

    CK_TILE_HOST_DEVICE constexpr tensor_view() = default;

@@ -140,6 +143,23 @@ struct tensor_view
            x);
    }

+    // Combine the vector `x` with the buffer contents at `coord` using the memory operation
+    // selected by DstInMemOp.
+    // X is a vector of DataType (its scalar type must match DataType's scalar type).
+    // "coord" is a coordinate of DataType, not of X; "coord" should be aligned to X.
+    template <typename X,
+              bool oob_conditional_check = true,
+              std::enable_if_t<
+                  std::is_same_v<typename vector_traits<remove_cvref_t<X>>::scalar_type,
+                                 typename vector_traits<remove_cvref_t<DataType>>::scalar_type>,
+                  bool> = false>
+    CK_TILE_HOST_DEVICE constexpr void update_vectorized_elements(
+        const TensorCoord& coord, const X& x, bool_constant<oob_conditional_check> = {})
+    {
+        const auto linear_offset = coord.get_offset();
+        const auto offset_is_valid =
+            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord);
+
+        buf_.template update<DstInMemOp, X, oob_conditional_check>(
+            linear_offset, offset_is_valid, x);
+    }
+
    CK_TILE_HOST_DEVICE void print() const
    {
        printf("tensor_view{");
@@ -178,6 +198,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p,
 }

 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          memory_operation_enum DstInMemOp      = memory_operation_enum::set,
          typename DataType,
          typename... Lengths,
          typename... Strides,
@@ -198,7 +219,7 @@ make_naive_tensor_view(DataType* p,

    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());

-    return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
+    return tensor_view<decltype(buffer_view), decltype(desc), DstInMemOp>{buffer_view, desc};
 }

 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
@@ -232,8 +253,9 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tensor_view(const OldTensorView& ol
                                                NewLowerDimensionOldVisibleIdss{},
                                                NewUpperDimensionNewVisibleIdss{});

-    return tensor_view<typename OldTensorView::buffer_view, remove_cvref_t<decltype(new_desc)>>{
-        old_tensor_view.buf_, new_desc};
+    return tensor_view<typename OldTensorView::buffer_view,
+                       remove_cvref_t<decltype(new_desc)>,
+                       remove_cvref_t<OldTensorView>::DstInMemOp>{old_tensor_view.buf_, new_desc};
 }

 template <typename TensorView,

--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/container/sequence.hpp"
 #include "ck_tile/core/container/tuple.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/container/meta_data_buffer.hpp"
 #include "ck_tile/core/tensor/tensor_adaptor.hpp"
 #include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
 #include "ck_tile/core/utility/functional.hpp"

--- a/include/ck_tile/core/tensor/tile_elementwise.hpp
+++ b/include/ck_tile/core/tensor/tile_elementwise.hpp
@@ -112,7 +112,7 @@ namespace impl {
 template <typename OutDataType, typename InTensor>
 CK_TILE_DEVICE auto cast_tile_pk_fp8x4(const InTensor& in_dstr_tensors)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx94__)
    // This API is designed to use the _pk_ serious of function
    constexpr auto in_tile_dstr = InTensor::get_tile_distribution();


--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -594,6 +594,66 @@ struct tile_window_with_static_distribution
        });
    }

+    // Update the bottom tensor with the contents of a distributed tensor.
+    // For every access, Traits::ScalarPerVector elements are gathered from the thread buffer of
+    // `dstr_tensor` into one vector_t, which is then handed to the bottom tensor view's
+    // update_vectorized_elements(); how the vector is combined with memory is decided there.
+    // `oob_conditional_check` is forwarded unchanged to the tensor view.
+    template <bool oob_conditional_check = true>
+    CK_TILE_DEVICE void update(const static_distributed_tensor<DataType, TileDstr>& dstr_tensor,
+                               bool_constant<oob_conditional_check> = {}) const
+    {
+        using Traits = load_store_traits;
+
+        using vector_t = typename Traits::vector_t;
+        using SFC_Ys   = typename Traits::SFC_Ys;
+
+        constexpr auto tile_dstr = TileDstr{};
+
+        // loop over thread tensor space [y0, y1, ...]
+        static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+            /// TODO: use structure binding (to be captured later) if compiled in C++20
+            // local copies of the pre-computed coordinates; they are advanced between accesses
+            auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
+            auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
+
+            static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
+                // flat access index across all coordinates
+                constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
+
+                // data index [y0, y1, ...]
+                constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
+
+                // read from distributed tensor
+                vector_t vec_value;
+
+                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                    // element j of the vector: same Y index, offset by j along VectorDimY
+                    constexpr auto idx_ys = generate_array(
+                        [&](auto jj) {
+                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
+                                                            : idx_ys_start[jj];
+                        },
+                        number<NDimY>{});
+
+                    // compile-time offset of that element inside the thread buffer
+                    constexpr index_t d =
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+
+                    vec_value.template get_as<DataType>()(j) =
+                        dstr_tensor.get_thread_buffer().template at<d>();
+                });
+
+                // write into bottom tensor
+                get_bottom_tensor_view().template update_vectorized_elements<vector_t>(
+                    bottom_tensor_thread_coord, vec_value, bool_constant<oob_conditional_check>{});
+
+                // move thread coordinate
+                // (skipped after the last access of this coordinate)
+                if constexpr(iCoordAccess != (NumAccessPerCoord - 1))
+                {
+                    constexpr auto idx_diff_ys = SFC_Ys::get_forward_step(iAccess);
+
+                    // the P part of the step is zero; only the Y part moves
+                    constexpr auto idx_diff_ps_ys =
+                        container_concat(array<index_t, NDimP>{0}, idx_diff_ys);
+
+                    move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+                }
+            });
+        });
+    }
+
    // move thread's botom tensor coordiante
    // [x0', x1', ... ] ==> [offset]
    // also move window-origin

--- a/include/ck_tile/core/tensor/update_tile.hpp
+++ b/include/ck_tile/core/tensor/update_tile.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/integral_constant.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/container/container_helper.hpp"
+#include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/tensor/tile_window.hpp"
+#include "ck_tile/core/utility/type_traits.hpp"
+
+namespace ck_tile {
+
+// Update the memory behind a window that only has static lengths: a window with the tensor's
+// tile distribution is created over the same view, lengths and origin, and its update() is
+// called with `dstr_tensor`.
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename DataType_>
+CK_TILE_DEVICE void
+update_tile(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>& tile_window_tmp,
+            const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    // the tensor's element type has to match the element type of the bottom tensor view
+    static_assert(std::is_same_v<remove_cvref_t<DataType_>,
+                                 remove_cvref_t<typename BottomTensorView_::DataType>>,
+                  "wrong!");
+
+    constexpr auto distribution = remove_cvref_t<TileDistribution_>{};
+
+    auto distributed_window = make_tile_window(tile_window_tmp.get_bottom_tensor_view(),
+                                               tile_window_tmp.get_window_lengths(),
+                                               tile_window_tmp.get_window_origin(),
+                                               distribution);
+
+    distributed_window.update(dstr_tensor);
+}
+
+// Overload for a window that already carries a static tile distribution: forwards directly to
+// the window's update() with `dstr_tensor`.
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          index_t NumCoord,
+          typename DataType_>
+CK_TILE_DEVICE void
+update_tile(tile_window_with_static_distribution<BottomTensorView_,
+                                                 WindowLengths_,
+                                                 TileDistribution_,
+                                                 NumCoord>& tile_window,
+            const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor)
+{
+    tile_window.update(dstr_tensor);
+}
+
+} // namespace ck_tile
--- a/include/ck_tile/core/utility/philox_rand.hpp
+++ b/include/ck_tile/core/utility/philox_rand.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+namespace ck_tile {
+
+// Reference: https://github.com/Dao-AILab/flash-attention/blob/main/csrc/flash_attn/src/philox.cuh
+// Philox 4x32 counter-based random number generator.
+// The 128-bit counter is made of the 64-bit offset (words x, y) and the 64-bit subsequence
+// (words z, w); the 64-bit seed is the key. 64-bit values are split explicitly into a low and
+// a high 32-bit word, which matches the in-memory layout on little-endian targets.
+class philox
+{
+    public:
+    // seed_   : 64-bit key (low word -> seed.x, high word -> seed.y)
+    // offset_ : 64-bit offset (low word -> counter.x, high word -> counter.y)
+    // counter.z / counter.w start at zero and are replaced by the subsequence on every draw.
+    CK_TILE_HOST_DEVICE philox(unsigned long long seed_, unsigned long long offset_)
+        : counter{lo32(offset_), hi32(offset_), 0u, 0u}, seed{lo32(seed_), hi32(seed_)}
+    {
+    }
+
+    // Produce four 32-bit random words for the given subsequence.
+    CK_TILE_HOST_DEVICE uint4 get_philox_4x32(const unsigned long long subsequence) const
+    {
+
+        uint4 counter_ = counter;
+        counter_.z     = lo32(subsequence);
+        counter_.w     = hi32(subsequence);
+
+        uint2 key_ = seed;
+// 7-round philox: 6 rounds with key update in the loop plus one final round
+#pragma unroll
+        for(int i = 0; i < 6; i++)
+        {
+            counter_ = philox_single_round(counter_, key_);
+            key_.x += kPhilox10A;
+            key_.y += kPhilox10B;
+        }
+        uint4 output = philox_single_round(counter_, key_);
+        return output;
+    }
+
+    // Write the four 32-bit words of get_philox_4x32(subsequence) to out[0..15].
+    // `out` must point to at least 16 writable bytes. The stores go through a uint32_t
+    // pointer, so `out` presumably has to be 4-byte aligned - TODO confirm with callers.
+    CK_TILE_HOST_DEVICE void get_random_16x8(uint8_t* out,
+                                             const unsigned long long subsequence) const
+    {
+        uint4 tmp_ph;
+        tmp_ph = get_philox_4x32(subsequence);
+
+        uint32_t* out_tmp = reinterpret_cast<uint32_t*>(&out[0]);
+
+        out_tmp[0] = tmp_ph.x;
+        out_tmp[1] = tmp_ph.y;
+        out_tmp[2] = tmp_ph.z;
+        out_tmp[3] = tmp_ph.w;
+    }
+
+    private:
+    uint4 counter;
+    const uint2 seed;
+
+    // low 32 bits of a 64-bit value
+    CK_TILE_HOST_DEVICE static constexpr unsigned int lo32(unsigned long long v)
+    {
+        return static_cast<unsigned int>(v & 0xFFFFFFFFull);
+    }
+
+    // high 32 bits of a 64-bit value
+    CK_TILE_HOST_DEVICE static constexpr unsigned int hi32(unsigned long long v)
+    {
+        return static_cast<unsigned int>(v >> 32);
+    }
+
+    // Full 32x32 -> 64-bit product; .x holds the low word, .y the high word.
+    CK_TILE_HOST_DEVICE uint2 mulhilo32(const unsigned int a, const unsigned int b) const
+    {
+        const unsigned long long product = static_cast<unsigned long long>(a) * b;
+        uint2 res                        = {lo32(product), hi32(product)};
+        return res;
+    }
+
+    // One Philox round: two multiplications whose halves are mixed with the counter and key.
+    CK_TILE_HOST_DEVICE uint4 philox_single_round(const uint4 ctr, const uint2 key) const
+    {
+
+        uint2 res0 = mulhilo32(kPhiloxSA, ctr.x);
+        uint2 res1 = mulhilo32(kPhiloxSB, ctr.z);
+        uint4 ret  = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
+        return ret;
+    }
+
+    // key increments (kPhilox10*) and round multipliers (kPhiloxS*); all fit in 32 bits
+    static constexpr unsigned int kPhilox10A = 0x9E3779B9;
+    static constexpr unsigned int kPhilox10B = 0xBB67AE85;
+    static constexpr unsigned int kPhiloxSA  = 0xD2511F53;
+    static constexpr unsigned int kPhiloxSB  = 0xCD9E8D57;
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/host.hpp
+++ b/include/ck_tile/host.hpp
@@ -11,6 +11,7 @@
 #include "ck_tile/host/host_tensor.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/host/ranges.hpp"
+#include "ck_tile/host/reference/reference_batched_dropout.hpp"
 #include "ck_tile/host/reference/reference_batched_elementwise.hpp"
 #include "ck_tile/host/reference/reference_batched_gemm.hpp"
 #include "ck_tile/host/reference/reference_batched_masking.hpp"
@@ -20,3 +21,4 @@
 #include "ck_tile/host/reference/reference_reduce.hpp"
 #include "ck_tile/host/reference/reference_softmax.hpp"
 #include "ck_tile/host/stream_config.hpp"
+#include "ck_tile/host/timer.hpp"
--- a/include/ck_tile/host/device_memory.hpp
+++ b/include/ck_tile/host/device_memory.hpp
@@ -27,7 +27,14 @@ struct DeviceMem
    DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
    {
-        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+        if(mMemSize != 0)
+        {
+            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+        }
+        else
+        {
+            mpDeviceBuf = nullptr;
+        }
    }
    void Realloc(std::size_t mem_size)
    {
@@ -36,7 +43,14 @@ struct DeviceMem
            HIP_CHECK_ERROR(hipFree(mpDeviceBuf));
        }
        mMemSize = mem_size;
-        HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+        if(mMemSize != 0)
+        {
+            HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+        }
+        else
+        {
+            mpDeviceBuf = nullptr;
+        }
    }
    void* GetDeviceBuffer() const { return mpDeviceBuf; }
    std::size_t GetBufferSize() const { return mMemSize; }
@@ -47,15 +61,18 @@ struct DeviceMem
            HIP_CHECK_ERROR(
                hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
        }
-        else
-        {
-            throw std::runtime_error("ToDevice with an empty pointer");
-        }
+        // else
+        // {
+        //     throw std::runtime_error("ToDevice with an empty pointer");
+        // }
    }
    void ToDevice(const void* p, const std::size_t cpySize) const
    {
-        HIP_CHECK_ERROR(
-            hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
+        if(mpDeviceBuf)
+        {
+            HIP_CHECK_ERROR(
+                hipMemcpy(mpDeviceBuf, const_cast<void*>(p), cpySize, hipMemcpyHostToDevice));
+        }
    }
    void FromDevice(void* p) const
    {
@@ -63,14 +80,17 @@ struct DeviceMem
        {
            HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
        }
-        else
-        {
-            throw std::runtime_error("FromDevice with an empty pointer");
-        }
+        // else
+        // {
+        //     throw std::runtime_error("FromDevice with an empty pointer");
+        // }
    }
    void FromDevice(void* p, const std::size_t cpySize) const
    {
-        HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
+        if(mpDeviceBuf)
+        {
+            HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost));
+        }
    }
    void SetZero() const
    {
@@ -82,13 +102,16 @@ struct DeviceMem
    template <typename T>
    void SetValue(T x) const
    {
-        if(mMemSize % sizeof(T) != 0)
+        if(mpDeviceBuf)
        {
-            throw std::runtime_error("wrong! not entire DeviceMem will be set");
-        }
+            if(mMemSize % sizeof(T) != 0)
+            {
+                throw std::runtime_error("wrong! not entire DeviceMem will be set");
+            }

-        // TODO: call a gpu kernel to set the value (?)
-        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
+            // TODO: call a gpu kernel to set the value (?)
+            set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
+        }
    }
    ~DeviceMem()
    {

--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -156,7 +156,7 @@ struct HostTensorDescriptor
    }

    const std::vector<std::size_t>& get_lengths() const { return mLens; } // read-only view of mLens
-    const std::vector<std::size_t>& GetStrides() const { return mStrides; }
+    const std::vector<std::size_t>& get_strides() const { return mStrides; } // read-only view of mStrides

    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
@@ -188,7 +188,7 @@ CK_TILE_HOST HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old
    for(std::size_t i = 0; i < a.get_num_of_dimension(); i++)
    {
        new_lengths[i] = a.get_lengths()[new2old[i]];
-        new_strides[i] = a.GetStrides()[new2old[i]];
+        new_strides[i] = a.get_strides()[new2old[i]];
    }

    return HostTensorDescriptor(new_lengths, new_strides);
@@ -327,7 +327,7 @@ struct HostTensor

    // The accessors below forward to the tensor's descriptor (mDesc).
    decltype(auto) get_lengths() const { return mDesc.get_lengths(); }

-    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
+    decltype(auto) get_strides() const { return mDesc.get_strides(); }

    std::size_t get_num_of_dimension() const { return mDesc.get_num_of_dimension(); }

@@ -481,6 +481,34 @@ struct HostTensor
        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
    }

+    // Return a tensor copy-constructed from *this whose descriptor has its dimensions
+    // permuted: output dimension i takes the length and stride of input dimension axes[i].
+    // Only the descriptor is replaced; no element is moved by this function.
+    //
+    // axes: one entry per dimension, each in [0, get_num_of_dimension()).
+    //       An empty vector (the default) reverses the dimension order.
+    // Throws std::runtime_error if axes has the wrong size or contains an out-of-range entry.
+    // NOTE(review): repeated entries in axes are not rejected - confirm whether callers need that.
+    HostTensor<T> transpose(std::vector<size_t> axes = {}) const
+    {
+        const std::size_t ndim = mDesc.get_num_of_dimension();
+        if(axes.empty())
+        {
+            // default permutation: ndim-1, ndim-2, ..., 0
+            axes.resize(ndim);
+            std::iota(axes.rbegin(), axes.rend(), 0);
+        }
+        if(axes.size() != ndim)
+        {
+            throw std::runtime_error(
+                "HostTensor::transpose(): size of axes must match tensor dimension");
+        }
+        std::vector<size_t> tlengths, tstrides;
+        tlengths.reserve(ndim);
+        tstrides.reserve(ndim);
+        for(const auto& axis : axes)
+        {
+            // vector::operator[] is unchecked, so reject bad axes before indexing with them
+            if(axis >= ndim)
+            {
+                throw std::runtime_error("HostTensor::transpose(): axis out of range");
+            }
+            tlengths.push_back(get_lengths()[axis]);
+            tstrides.push_back(get_strides()[axis]);
+        }
+        HostTensor<T> ret(*this);
+        ret.mDesc = HostTensorDescriptor(tlengths, tstrides);
+        return ret;
+    }
+
+    // Non-const overload: delegates to the const transpose() with the same axes.
+    HostTensor<T> transpose(std::vector<size_t> axes = {})
+    {
+        const HostTensor<T>& self = *this;
+        return self.transpose(axes);
+    }
+
    typename Data::iterator begin() { return mData.begin(); } // mutable iterator to the start of mData

    typename Data::iterator end() { return mData.end(); } // mutable past-the-end iterator of mData

--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/host/stream_config.hpp"
 #include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/host/timer.hpp"
 #include <hip/hip_runtime.h>
 #include <cstddef>

@@ -14,153 +15,92 @@ template <int MaxThreadPerBlock, int MinBlockPerCu, typename Kernel, typename...
 #if CK_TILE_USE_LAUNCH_BOUNDS
 __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
 #endif
-    __global__ void kentry(Kernel f, Args... args)
+    __global__ void kentry(Args... args)
 {
-    f(args...);
+    Kernel{}(args...);
 }

-template <typename... Args, typename F>
-CK_TILE_HOST float launch_and_time_kernel(const stream_config& s,
-                                          F kernel,
-                                          dim3 grid_dim,
-                                          dim3 block_dim,
-                                          std::size_t lds_byte,
-                                          Args... args)
+//
+// returns an anonymous functor (lambda) to be called later with a stream_config
+// the KernelImpl should be a class without non-static data members, i.e. one that
+// can be instantiated with "KernelImpl{}" (the KernelImpl object passed in is not used)
+//
+// the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
+//
+template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
+          int MinBlockPerCu     = CK_TILE_MIN_BLOCK_PER_CU,
+          typename KernelImpl,
+          typename... Args>
+CK_TILE_HOST auto
+make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
 {
-#if CK_TILE_TIME_KERNEL
-    if(s.time_kernel_)
-    {
-        // warm up
-        for(int i = 0; i < s.cold_niters_; ++i)
-        {
-            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-            hip_check_error(hipGetLastError());
-        }
-
-        const int nrepeat = s.nrepeat_;
-        hipEvent_t start, stop;
-
-        HIP_CHECK_ERROR(hipEventCreate(&start));
-        HIP_CHECK_ERROR(hipEventCreate(&stop));
-
-        HIP_CHECK_ERROR(hipDeviceSynchronize());
-        HIP_CHECK_ERROR(hipEventRecord(start, s.stream_id_));
-
-        for(int i = 0; i < nrepeat; ++i)
-        {
-            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-            hip_check_error(hipGetLastError());
-        }
-
-        HIP_CHECK_ERROR(hipEventRecord(stop, s.stream_id_));
-        HIP_CHECK_ERROR(hipEventSynchronize(stop));
-
-        float total_time = 0;
-
-        HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start, stop));
+    const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;

-        return total_time / nrepeat;
-    }
-    else
-    {
+    return [=](const stream_config& s) { // captures kernel, launch dims, lds_byte and args by copy
         kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-        hip_check_error(hipGetLastError());
-        return 0;
-    }
-#else
-    kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-    hip_check_error(hipGetLastError());
-    return 0;
-#endif
+    };
 }

-template <typename... Args, typename F, typename PreProcessFunc>
-CK_TILE_HOST float launch_and_time_kernel_with_preprocess(const stream_config& s,
-                                                          PreProcessFunc preprocess,
-                                                          F kernel,
-                                                          dim3 grid_dim,
-                                                          dim3 block_dim,
-                                                          std::size_t lds_byte,
-                                                          Args... args)
+// clang-format off
+/*
+ * launch_kernel()
+ *
+ * this is the function to launch arbitrary number of kernels with optional timer(selected by stream_config)
+ * the callables should have signature as "operator()(const stream_config& s){ ... }" to call
+ * 
+ * the simplest way is pass in a lambda function, with "[=](const stream_config& s){ call_your_kernel_here() }"
+ * as signature, for the callable (pay attention to the capture list)
+ * 
+ * e.g.
+ *  ck_tile::launch_kernel(s,
+ *                      [=](const stream_config& s){ hipMemset(ptr, 0, size); },
+ *                      [=](const stream_config& s){ some_kernel<<<grids, blocks>>>(arg); }
+ *                      );
+ * 
+ * if you use a ck_tile kernel, or one similar to this style (structure with "static __device__ operator()(...){}")
+ * you can pass your kernel to ck_tile::make_kernel(), which will create an anonymous functor for you,
+ * then pass it to ck_tile::launch_kernel()
+ * 
+ * e.g.
+ *  ck_tile::launch_kernel(s,
+ *                      ck_tile::make_kernel<T0, B0>(kernel_0{}, grids0, blocks0, 0, kargs0),
+ *                      ck_tile::make_kernel<T0, B1>(kernel_1{}, grids1, blocks1, 0, kargs1),
+ *                       ...);
+ **/
+// clang-format on
+template <typename... Callables>
+CK_TILE_HOST float launch_kernel(const stream_config& s, Callables... callables)
 {
-#if CK_TILE_TIME_KERNEL
-    if(s.time_kernel_)
-    {
-#if CK_TILE_DEBUG_LOG
-        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
-               __func__,
-               grid_dim.x,
-               grid_dim.y,
-               grid_dim.z,
-               block_dim.x,
-               block_dim.y,
-               block_dim.z);
-
-        printf("Warm up 1 time\n");
-#endif
-        // warm up
-        preprocess();
-        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-        hip_check_error(hipGetLastError());
-
-        const int nrepeat = 10;
-#if CK_TILE_DEBUG_LOG
-        printf("Start running %d times...\n", nrepeat);
-#endif
-        hipEvent_t start, stop;
-
-        HIP_CHECK_ERROR(hipEventCreate(&start));
-        HIP_CHECK_ERROR(hipEventCreate(&stop));
-
-        HIP_CHECK_ERROR(hipDeviceSynchronize());
-        HIP_CHECK_ERROR(hipEventRecord(start, s.stream_id_));
+    // clang-format off
+    if(!s.time_kernel_) { // timing disabled: invoke every callable once, in argument order, and return 0
+        (callables(s),...); hip_check_error(hipGetLastError());
+        return 0;
+    }
+    if(s.is_gpu_timer_) {
+        gpu_timer timer {};

-        for(int i = 0; i < nrepeat; ++i)
-        {
-            preprocess();
-            kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-            hip_check_error(hipGetLastError());
-        }
+        // warmup: s.cold_niters_ untimed rounds of all callables; the error check runs once, after the loop
+        for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } hip_check_error(hipGetLastError());

-        HIP_CHECK_ERROR(hipEventRecord(stop, s.stream_id_));
-        HIP_CHECK_ERROR(hipEventSynchronize(stop));
+        timer.start(s.stream_id_);
+        for(int i = 0; i < s.nrepeat_; i++) { (callables(s),...); } hip_check_error(hipGetLastError());
+        timer.stop(s.stream_id_);

-        float total_time = 0;
+        return timer.duration() / s.nrepeat_; // average over the timed rounds; assumes s.nrepeat_ > 0
+    }
+    else {
+        cpu_timer timer {};

-        HIP_CHECK_ERROR(hipEventElapsedTime(&total_time, start, stop));
+        // warmup: s.cold_niters_ untimed rounds of all callables; the error check runs once, after the loop
+        for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } hip_check_error(hipGetLastError());

-        return total_time / nrepeat;
-    }
-    else
-    {
-        preprocess();
-        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-        hip_check_error(hipGetLastError());
+        timer.start(s.stream_id_);
+        for(int i = 0; i < s.nrepeat_; i++) { (callables(s),...); } hip_check_error(hipGetLastError());
+        timer.stop(s.stream_id_);

-        return 0;
+        return timer.duration() / s.nrepeat_; // average over the timed rounds; assumes s.nrepeat_ > 0
    }
-#else
-    kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);
-    hip_check_error(hipGetLastError());
-
-    return 0;
-#endif
+    // clang-format on
 }

-template <int MaxThreadPerBlock = CK_TILE_MAX_THREAD_PER_BLOCK,
-          int MinBlockPerCu     = CK_TILE_MIN_BLOCK_PER_CU,
-          typename KernelImpl,
-          typename... Args>
-CK_TILE_HOST float launch_kernel(const stream_config& s,
-                                 KernelImpl kernel_impl,
-                                 dim3 grid_dim,
-                                 dim3 block_dim,
-                                 std::size_t dynamic_smem_byte,
-                                 Args... args)
-{
-    const auto kernel = kentry<MaxThreadPerBlock, MinBlockPerCu, KernelImpl, Args...>;
-
-    return launch_and_time_kernel(
-        s, kernel, grid_dim, block_dim, dynamic_smem_byte, kernel_impl, args...);
-}
 } // namespace ck_tile
--- a/include/ck_tile/host/reference/reference_batched_dropout.hpp
+++ b/include/ck_tile/host/reference/reference_batched_dropout.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+#include <thread>
+
+namespace ck_tile {
+
+// Host reference for dropout on a 3-D tensor, applied in place.
+// For every (batch, m, n): the element is multiplied by `scale` (computed in float) and kept
+// when randval_b_m_n(batch, m, n) <= p_undrop_in_uint8_t; otherwise it is set to DataType(0).
+// The (batch, m) range comes from randval_b_m_n's first two lengths, the n range from
+// in_out_b_m_n's third length; rows are distributed via make_ParallelTensorFunctor.
+template <typename DataType, typename RandValOutputDataType>
+CK_TILE_HOST void reference_batched_dropout(HostTensor<DataType>& in_out_b_m_n,
+                                            const HostTensor<RandValOutputDataType>& randval_b_m_n,
+                                            const uint8_t& p_undrop_in_uint8_t,
+                                            const float scale)
+{
+    const int row_length = in_out_b_m_n.mDesc.get_lengths()[2];
+
+    // processes one (batch, m) row
+    auto drop_row = [&](auto batch, auto m) {
+        for(int col = 0; col < row_length; ++col)
+        {
+            const float scaled =
+                ck_tile::type_convert<float>(in_out_b_m_n(batch, m, col)) * scale;
+            if(randval_b_m_n(batch, m, col) <= p_undrop_in_uint8_t)
+            {
+                in_out_b_m_n(batch, m, col) = ck_tile::type_convert<DataType>(scaled);
+            }
+            else
+            {
+                in_out_b_m_n(batch, m, col) = DataType(0);
+            }
+        }
+    };
+
+    const auto& rand_lengths = randval_b_m_n.mDesc.get_lengths();
+    auto parallel_rows = make_ParallelTensorFunctor(drop_row, rand_lengths[0], rand_lengths[1]);
+    parallel_rows(std::thread::hardware_concurrency());
+}
+} // namespace ck_tile
--- a/include/ck_tile/host/stream_config.hpp
+++ b/include/ck_tile/host/stream_config.hpp
@@ -6,6 +6,22 @@
 #include <hip/hip_runtime.h>

 namespace ck_tile {
+/*
+ * construct this structure with behavior as:
+ *
+ *   // create stream config with default stream(NULL), and not timing the kernel
+ *   stream_config s = stream_config{};
+ *
+ *   // create stream config with _some_stream_id_, and not timing the kernel
+ *   stream_config s = stream_config{_some_stream_id_};
+ *
+ *   // create stream config with _some_stream_id_, and benchmark with warmup/repeat as default
+ *   stream_config s = stream_config{_some_stream_id_, true};
+ *
+ *   // create stream config with _some_stream_id_, and benchmark using cpu timer
+ *   stream_config s = stream_config{_some_stream_id_, true, 0, 3, 10, false};
+ **/
+
 struct stream_config
 {
    hipStream_t stream_id_ = nullptr;
@@ -13,5 +29,6 @@ struct stream_config
    int log_level_         = 0;
    int cold_niters_       = 3;
    int nrepeat_           = 10;
+    bool is_gpu_timer_     = true; // keep compatible
 };
 } // namespace ck_tile