Commit 1b616990 authored by aska-0096

Merge branch 'develop' of https://github.com/ROCm/composable_kernel into update_cka8w8_uc

parents af30d6b6 800cf897
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
// Address Space for AMDGCN
// https://llvm.org/docs/AMDGPUUsage.html#address-space
namespace ck_tile {
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
// cast a pointer in "Constant" address space (4) to "Generic" address space (0)
// only a C-style pointer cast seems to compile here
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
template <typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
// cast a pointer in "Generic" address space (0) to "Constant" address space (4)
// only a C-style pointer cast seems to compile here
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
} // namespace ck_tile
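// Editorial usage sketch (KernelArgs and example_kernel are illustrative, not part of
// this header): kernel arguments staged in the constant address space are recovered as
// generic pointers inside the kernel.
struct KernelArgs
{
    int n;
    float alpha;
};

__global__ void example_kernel(const KernelArgs CK_CONSTANT_ADDRESS_SPACE* p_const)
{
    // recover a generic-address-space pointer so ordinary C++ code can dereference it
    const KernelArgs* p = ck_tile::cast_pointer_to_generic_address_space(p_const);
    (void)p; // ... use p->n, p->alpha ...
}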
......@@ -109,4 +109,22 @@ CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
#pragma clang diagnostic pop
}
template <typename CompareTo, typename... Rest>
struct is_any_of : std::false_type
{
};
template <typename CompareTo, typename FirstType>
struct is_any_of<CompareTo, FirstType> : std::is_same<CompareTo, FirstType>
{
};
template <typename CompareTo, typename FirstType, typename... Rest>
struct is_any_of<CompareTo, FirstType, Rest...>
: std::integral_constant<bool,
std::is_same<CompareTo, FirstType>::value ||
is_any_of<CompareTo, Rest...>::value>
{
};
} // namespace ck_tile
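// Editorial usage sketch: is_any_of is a compile-time membership test over a type list.
static_assert(ck_tile::is_any_of<float, int, float, double>::value, "float is in the list");
static_assert(!ck_tile::is_any_of<char, int, float, double>::value, "char is not in the list");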
......@@ -51,16 +51,18 @@ struct composes<F>
template <typename... Ts>
__host__ __device__ composes(Ts&&...)->composes<remove_cvref_t<Ts>...>;
template <typename To>
template <typename SaturateType>
struct saturates
{
template <typename From>
CK_TILE_HOST_DEVICE constexpr auto operator()(const From& from) const
-> std::enable_if_t<std::is_arithmetic_v<From>, From>
// NOTE: this function does not return a SaturateType value;
// it is the user's responsibility to perform (or skip) the further cast
template <typename AccType>
CK_TILE_HOST_DEVICE constexpr auto operator()(const AccType& a_) const
-> std::enable_if_t<std::is_arithmetic_v<AccType>, AccType>
{
return clamp(from,
type_convert<From>(numeric<To>::lowest()),
type_convert<From>(numeric<To>::max()));
return clamp(a_,
type_convert<AccType>(numeric<SaturateType>::lowest()),
type_convert<AccType>(numeric<SaturateType>::max()));
}
};
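// Editorial usage sketch (example_saturate_to_int8 is illustrative): clamp a float
// accumulator to int8_t's representable range; per the NOTE above, the clamped value
// keeps the accumulator type and the final cast is the caller's job.
inline int8_t example_saturate_to_int8(float a)
{
    constexpr saturates<int8_t> sat{};
    const float clamped = sat(a);         // e.g. 300.f -> 127.f
    return type_convert<int8_t>(clamped); // explicit final cast
}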
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -34,3 +34,4 @@
#include "ck_tile/host/reference/reference_topk.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
#include "ck_tile/host/reference/reference_batched_transpose.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -18,6 +18,112 @@
namespace ck_tile {
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_relative_threshold(const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, expo - numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error =
std::pow(2, expo - numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
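// Editorial usage sketch (the K and magnitude values are illustrative): derive rtol/atol
// for verifying a GEMM with K accumulation steps using fp16 compute/output and fp32
// accumulation, then feed them to a comparison routine such as check_err.
inline std::pair<double, double> example_gemm_tolerances(int K, double max_abs_value)
{
    const double rtol = get_relative_threshold<half_t, half_t, float>(K);
    const double atol = get_absolute_threshold<half_t, half_t, float>(max_abs_value, K);
    return {rtol, atol};
}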
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
......@@ -337,7 +443,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
}
if(!res)
{
std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
......
......@@ -14,57 +14,41 @@ namespace detail {
template <typename OldLayout>
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNCW> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKCX> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNKW>)
using namespace ck_tile::tensor_layout::convolution;
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
{
return {0, 1, 2, 3};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNCHW> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKCYX> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNKHW>)
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
{
return {0, 1, 2, 3, 4};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNCDHW> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKCZYX> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNKDHW>)
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
{
return {0, 1, 2, 3, 4, 5};
}
if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNWC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKXC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNWK>)
if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
{
return {0, 1, 3, 2};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNHWC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKYXC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNHWK>)
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNDHWC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GKZYXC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::GNDHWK>)
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
{
return {0, 1, 5, 2, 3, 4};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NWGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::KXGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NWGK>)
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
{
return {2, 0, 3, 1};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NHWGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::KYXGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NHWGK>)
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
{
return {3, 0, 4, 1, 2};
}
else if constexpr(std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NDHWGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::KZYXGC> ||
std::is_same_v<OldLayout, ck_tile::tensor_layout::convolution::NDHWGK>)
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
{
return {4, 0, 5, 1, 2, 3};
}
......@@ -83,11 +67,11 @@ template <typename InLayout>
CK_TILE_HOST HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNCW> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNCHW> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNCDHW>)
if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
......@@ -97,9 +81,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNWC> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNHWC> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::GNDHWC>)
else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
......@@ -109,9 +91,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::NWGC> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::NHWGC> ||
std::is_same_v<InLayout, ck_tile::tensor_layout::convolution::NDHWGC>)
else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
......@@ -139,11 +119,11 @@ template <typename WeiLayout>
CK_TILE_HOST HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KXC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KYXC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KZYXC>)
if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
{
if(param.G_ != 1)
{
......@@ -157,9 +137,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKCX> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKCYX> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKCZYX>)
else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
......@@ -169,9 +147,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKXC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKYXC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::GKZYXC>)
else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
......@@ -181,9 +157,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KXGC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KYXGC> ||
std::is_same_v<WeiLayout, ck_tile::tensor_layout::convolution::KZYXGC>)
else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.G_),
......@@ -211,11 +185,11 @@ template <typename OutLayout>
CK_TILE_HOST HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNKW> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNKHW> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNKDHW>)
if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
......@@ -226,9 +200,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNWK> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNHWK> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::GNDHWK>)
else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
......@@ -238,9 +210,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::NWGK> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::NHWGK> ||
std::is_same_v<OutLayout, ck_tile::tensor_layout::convolution::NDHWGK>)
else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -678,4 +678,43 @@ struct HostTensor
Descriptor mDesc;
Data mData;
};
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
using namespace ck_tile::literals;
if constexpr(is_row_major)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
}
template <bool is_row_major>
auto get_default_stride(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
if(stride == 0)
{
if constexpr(is_row_major)
{
return col;
}
else
{
return row;
}
}
else
{
return stride;
}
}
} // namespace ck_tile
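// Editorial usage sketch: build a packed row-major 128x64 descriptor; a stride of 0
// requests the default packed stride (the column count for row-major layouts).
inline auto make_example_descriptor()
{
    using namespace ck_tile;
    const auto stride = get_default_stride(128, 64, /*stride=*/0, bool_constant<true>{});
    return host_tensor_descriptor(128, 64, stride, bool_constant<true>{}); // strides {64, 1}
}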
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include <thread>
namespace ck_tile {
template <typename Type>
CK_TILE_HOST void reference_batched_transpose(const HostTensor<Type>& x,
HostTensor<Type>& y,
std::string layout_in = "NCHW",
std::string layout_out = "NHWC")
{
const int N = x.mDesc.get_lengths()[0];
auto f = [&](auto batch) {
if(layout_in == "NCHW" && layout_out == "NHWC")
{
const int C = x.mDesc.get_lengths()[1];
const int H = x.mDesc.get_lengths()[2];
const int W = x.mDesc.get_lengths()[3];
for(int c = 0; c < C; ++c)
{
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
Type v_x = x(batch, c, h, w);
y(batch, h, w, c) = v_x;
}
}
}
}
else if(layout_in == "NHWC" && layout_out == "NCHW")
{
const int H = x.mDesc.get_lengths()[1];
const int W = x.mDesc.get_lengths()[2];
const int C = x.mDesc.get_lengths()[3];
for(int h = 0; h < H; ++h)
{
for(int w = 0; w < W; ++w)
{
for(int c = 0; c < C; ++c)
{
Type v_x = x(batch, h, w, c);
y(batch, c, h, w) = v_x;
}
}
}
}
};
make_ParallelTensorFunctor(f, N)(std::thread::hardware_concurrency());
}
} // namespace ck_tile
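// Editorial usage sketch: transpose a small NCHW host tensor to NHWC.
inline void example_nchw_to_nhwc()
{
    ck_tile::HostTensor<float> x({2, 3, 4, 5}); // N, C, H, W
    ck_tile::HostTensor<float> y({2, 4, 5, 3}); // N, H, W, C
    // ... fill x ...
    ck_tile::reference_batched_transpose(x, y, "NCHW", "NHWC");
}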
......@@ -73,7 +73,7 @@ void reference_fused_moe(
ck_tile::index_t tokens,
ck_tile::index_t experts,
ck_tile::index_t hidden_size,
ck_tile::index_t intermediate_size, // this size is for gate/up
ck_tile::index_t intermediate_size, // this size is for gate/up/down
ck_tile::index_t topk,
ck_tile::index_t gate_only)
{
......@@ -82,19 +82,8 @@ void reference_fused_moe(
assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
assert(num_sorted_tiles_host.get_element_size() == 1);
ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
ck_tile::index_t intermediate_size_0 = intermediate_size;
ck_tile::index_t intermediate_size_1 = intermediate_size / (gate_only ? 1 : 2);
// TODO: better remove this in the future, or modify the token_id value
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
{
if(token_ids_host(token_id_, i_) == expert_id_)
return i_;
}
throw std::runtime_error("no matching token/expert pair\n");
return -1; // unreachable; silences missing-return warnings
};
ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
ck_tile::index_t intermediate_size_1 = intermediate_size;
ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
......@@ -105,11 +94,31 @@ void reference_fused_moe(
if(i_tile >= num_sorted_tiles)
return;
ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
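// mock-id encoding: the topk index lives in the top 8 bits, the token id in the low 24 bits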
ck_tile::index_t i_topk = i_token >> 24;
i_token &= 0xffffff;
if(i_token >= tokens)
return;
(void)token_ids_host;
#else
// TODO: better remove this in the future, or modify the token_id value
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
{
if(token_ids_host(token_id_, i_) == expert_id_)
return i_;
}
throw std::runtime_error("no matching token/expert pair\n");
return -1; // unreachable; silences missing-return warnings
};
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
if(i_token >= tokens)
return;
ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
auto weight = sorted_weight_host.mData[i_flatten];
#endif
auto weight = sorted_weight_host.mData[i_flatten];
ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
// first gemm
......
......@@ -8,16 +8,40 @@
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M row
struct reference_rmsnorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
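// Editorial sketch of a custom epilogue matching the interface above (scale_epilogue
// and the scaling step are illustrative, not part of this header):
struct scale_epilogue
{
    float scale = 0.5f;
    template <typename OutDataType, typename AccDataType>
    void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
    {
        const int N = acc.mDesc.get_lengths()[1];
        for(int n = 0; n < N; ++n)
        {
            o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n) * scale);
        }
    }
};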
template <typename XDataType,
typename GammaDataType,
typename ComputeDataType,
typename YDataType,
typename InvRmsDataType>
typename InvRmsDataType,
typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
HostTensor<YDataType>& y_m_n,
HostTensor<InvRmsDataType>& invRms_m,
ComputeDataType epsilon)
ComputeDataType epsilon,
Epilogue epilogue_functor = {})
{
auto rmsnorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
......@@ -37,13 +61,15 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
auto y = x * divisor * gamma;
y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
acc(m, n) = x * divisor * gamma;
}
epilogue_functor(m, y_m_n, acc);
};
make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
......@@ -3,8 +3,9 @@
#pragma once
#include "ck_tile/ops/welford/block/block_welford.hpp"
#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
#include "ck_tile/ops/welford/thread/thread_welford.hpp"
#include "ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_pipeline.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_problem.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/hip_check_error.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
struct BatchedTransposeHostArgs
{
const void* p_input;
void* p_output;
index_t batch;
index_t height;
index_t width;
// index_t dim_blocks;
index_t dim_stride;
index_t dim_block_h;
index_t dim_block_w;
};
template <typename Pipeline_>
struct BatchedTransposeKernel
{
using Pipeline = remove_cvref_t<Pipeline_>;
using Problem = remove_cvref_t<typename Pipeline::Problem>;
using Type = typename Problem::InputType;
struct BatchedTransposeKargs
{
const void* p_input;
void* p_output;
index_t batch;
index_t height;
index_t width;
index_t dim_stride;
};
using Kargs = BatchedTransposeKargs;
using Hargs = BatchedTransposeHostArgs;
CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
{
size_t grid_size_x = (h.width + h.dim_block_w - 1) / h.dim_block_w;
size_t grid_size_y = (h.height + h.dim_block_h - 1) / h.dim_block_h;
size_t grid_size_z = h.batch;
return dim3(grid_size_x, grid_size_y, grid_size_z);
}
CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
{
Kargs k;
k.p_input = h.p_input;
k.p_output = h.p_output;
k.batch = h.batch;
k.height = h.height;
k.width = h.width;
k.dim_stride = h.dim_stride;
return k;
}
CK_TILE_HOST_DEVICE static constexpr auto BlockSize() { return Problem::kBlockSize; }
CK_TILE_DEVICE void operator()(Kargs kargs) const
{
static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock;
static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock;
static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN;
static constexpr ck_tile::index_t kMPerThread = Problem::kMPerThread;
static constexpr ck_tile::index_t kNPerThread = Problem::kNPerThread;
static_assert(kMPerThread == 1 && kNPerThread == 1);
const auto iDim = blockIdx.z;
const auto x_m_n = [&]() {
const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<const Type*>(kargs.p_input) + iDim * kargs.dim_stride,
make_tuple(kargs.height, kargs.width),
make_tuple(kargs.width, 1),
number<kNPerThread>{}, // TODO thread load value
number<1>{});
return pad_tensor_view(x_dram_naive,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
sequence<kPadM, kPadN>{});
}();
// grid x tiles the width (N) and grid y tiles the height (M), matching GridSize()
const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.x * kNPerBlock);
const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.y * kMPerBlock);
const auto y_n_m = [&]() {
const auto y_dram_naive = make_naive_tensor_view<address_space_enum::global>(
static_cast<Type*>(kargs.p_output) + iDim * kargs.dim_stride,
make_tuple(kargs.width, kargs.height),
make_tuple(kargs.height, 1),
number<kMPerThread>{},
number<1>{});
return pad_tensor_view(y_dram_naive,
make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
sequence<kPadN, kPadM>{});
}();
auto x_block_window =
make_tile_window(x_m_n,
make_tuple(number<kMPerBlock>{}, number<kNPerBlock>{}),
{static_cast<ck_tile::index_t>(iM),
static_cast<ck_tile::index_t>(iN)});
auto y_block_window =
make_tile_window(y_n_m,
make_tuple(number<kNPerBlock>{}, number<kMPerBlock>{}),
{static_cast<ck_tile::index_t>(iN),
static_cast<ck_tile::index_t>(iM)});
Pipeline{}(x_block_window, y_block_window);
}
};
} // namespace ck_tile
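// Editorial host-side sketch (Pipeline, pointers, and sizes are illustrative; the
// project's kernel-launch helper is omitted): map the host args to a launch configuration.
template <typename Pipeline>
void example_configure_transpose(const void* p_src, void* p_dst)
{
    using Kernel = ck_tile::BatchedTransposeKernel<Pipeline>;

    ck_tile::BatchedTransposeHostArgs h{};
    h.p_input     = p_src; // batch x height x width, row-major
    h.p_output    = p_dst;
    h.batch       = 8;
    h.height      = 64;
    h.width       = 128;
    h.dim_stride  = h.height * h.width; // elements between consecutive batches
    h.dim_block_h = 32;                 // must match the problem's block tile
    h.dim_block_w = 32;

    const dim3 grid  = Kernel::GridSize(h); // ceil(W/32) x ceil(H/32) x batch
    const auto kargs = Kernel::MakeKargs(h);
    (void)grid;
    (void)kargs; // hand off to the launch helper with Kernel::BlockSize() threads per block
}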
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/batched_transpose/pipeline/batched_transpose_policy.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename Problem_, typename Policy_ = BatchedTransposePolicy>
struct BatchedTransposePipeline
{
// TODO: this kernel only supports one warp per row
using Problem = remove_cvref_t<Problem_>;
using Policy = remove_cvref_t<Policy_>;
using InputType = ck_tile::remove_cvref_t<typename Problem::InputType>;
static constexpr ck_tile::index_t kMPerBlock = Problem::kMPerBlock;
static constexpr ck_tile::index_t kNPerBlock = Problem::kNPerBlock;
static constexpr index_t AlignmentM = Problem::AlignmentM;
static constexpr index_t AlignmentN = Problem::AlignmentN;
static constexpr bool kPadM = Problem::kPadM;
static constexpr bool kPadN = Problem::kPadN;
template <typename InputWindow, typename OutputWindow>
CK_TILE_DEVICE auto operator()(const InputWindow& input_window, OutputWindow& out_window)
{
auto inp_win =
make_tile_window(input_window, Policy::template MakeInputDistribution<Problem>());
auto out_win =
make_tile_window(out_window, Policy::template MakeOutputDistribution<Problem>());
auto x = load_tile(inp_win); // per-thread tile loaded from the block-scoped input window
auto y = make_static_distributed_tensor<InputType>(
Policy::template MakeOutputDistribution<Problem>());
constexpr auto span_2d_x = decltype(x)::get_distributed_spans();
sweep_tile_span(span_2d_x[number<0>{}], [&](auto idx0) {
sweep_tile_span(span_2d_x[number<1>{}], [&](auto idx1) {
constexpr auto i_j_idx = make_tuple(idx1, idx0);
y(i_j_idx) = x(i_j_idx);
});
});
store_tile(out_win, y);
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/softmax.hpp"
#include "ck_tile/ops/topk.hpp"
namespace ck_tile {
struct BatchedTransposePolicy
{
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeInputDistribution()
{
using S = Problem;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<>,
tuple<sequence<S::kMWarpPerBlock, S::kMThreadPerWarp, S::kMPerThread>,
sequence<S::kNWarpPerBlock, S::kNThreadPerWarp, S::kNPerThread>>,
tuple<sequence<1, 2>, sequence<1, 2>>,
tuple<sequence<0, 0>, sequence<1, 1>>,
sequence<1, 2>,
sequence<2, 2>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeOutputDistribution()
{
using S = Problem;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<>,
tuple<sequence<S::kNWarpPerBlock, S::kNThreadPerWarp, S::kNPerThread>,
sequence<S::kMWarpPerBlock, S::kMThreadPerWarp, S::kMPerThread>>,
tuple<sequence<2, 1>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<1, 1>>,
sequence<2, 1>,
sequence<2, 2>>{});
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
#define VectorLoadSize 16
namespace ck_tile {
template <typename InputType_,
typename BlockTile, // Sequence<...
typename WarpTile, // Sequence<...
typename ThreadTile, // Sequence<...
bool kPadM_ = true,
bool kPadN_ = true>
struct BatchedTransposeProblem
{
using InputType = remove_cvref_t<InputType_>;
static constexpr index_t kMPerThread = ThreadTile::at(number<0>{});
static constexpr index_t kNPerThread = ThreadTile::at(number<1>{});
static constexpr index_t kMPerWarp = WarpTile::at(number<0>{});
static constexpr index_t kNPerWarp = WarpTile::at(number<1>{});
static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread;
static constexpr index_t kNThreadPerWarp = kNPerWarp / kNPerThread;
static constexpr index_t kMPerBlock = BlockTile::at(number<0>{});
static constexpr index_t kNPerBlock = BlockTile::at(number<1>{});
static constexpr index_t kMWarpPerBlock = kMPerBlock / kMPerWarp;
static constexpr index_t kNWarpPerBlock = kNPerBlock / kNPerWarp;
static constexpr index_t kBlockSize =
kMThreadPerWarp * kNThreadPerWarp * kMWarpPerBlock * kNWarpPerBlock;
static constexpr bool kPadM = kPadM_;
static constexpr bool kPadN = kPadN_;
static constexpr index_t AlignmentM = kPadM ? VectorLoadSize / sizeof(InputType) : 1; // TODO
static constexpr index_t AlignmentN = kPadN ? VectorLoadSize / sizeof(InputType) : 1;
};
} // namespace ck_tile
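// Editorial sketch: a concrete instantiation and the constants it derives (a wave64-sized
// warp tile of 8x8 threads and 2x2 warps per block; the tile sizes are illustrative).
using ExampleTransposeProblem =
    ck_tile::BatchedTransposeProblem<ck_tile::half_t,
                                     ck_tile::sequence<16, 16>, // block tile (M, N)
                                     ck_tile::sequence<8, 8>,   // warp tile
                                     ck_tile::sequence<1, 1>>;  // thread tile
static_assert(ExampleTransposeProblem::kMThreadPerWarp == 8, "8x8 threads per warp");
static_assert(ExampleTransposeProblem::kNWarpPerBlock == 2, "2x2 warps per block");
static_assert(ExampleTransposeProblem::kBlockSize == 256, "8*8 * 2*2 threads per block");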
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......
......@@ -719,7 +719,82 @@ struct Silu
constexpr T one = type_convert<T>(1);
y = x * (one / (one + ck_tile::exp(-x)));
};
template <>
CK_TILE_HOST_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
{
constexpr auto one = type_convert<float>(1);
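// __builtin_amdgcn_rcpf maps to v_rcp_f32, a fast approximate hardware reciprocal;
// assumed acceptable accuracy for SiLU here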
y[0] = x[0] * __builtin_amdgcn_rcpf(one + ck_tile::exp(-x[0]));
y[1] = x[1] * __builtin_amdgcn_rcpf(one + ck_tile::exp(-x[1]));
};
};
#if 0
// SiLU: the formula's dependency chain makes it a poor fit for inline asm.
// The code is kept here on purpose in case people want to try it in the future.
struct SiluAsm
{
template <typename T>
CK_TILE_HOST void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
constexpr T one = type_convert<T>(1);
y = x * (one / (one + ck_tile::exp(-x)));
};
template <typename T>
CK_TILE_DEVICE void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // log2e_v<float> * -1;
// NOTE: x and y must not share a register going into the inline asm;
// "+v" on y and "v" on x is not enough, x and y may still be assigned the same register
T tmp = x;
asm volatile("v_mul_f32 %[v_y], %[s_log2e], %[v_x]\n"
"v_exp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for exp\n"
"v_add_f32 %[v_y], %[v_y], 1.0\n"
"v_rcp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for rcp\n"
"v_mul_f32 %[v_y], %[v_x], %[v_y]\n"
: [v_y] "+v"(y), [v_x] "+v"(tmp)
: [s_log2e] "s"(log2e_neg_)
:);
};
template <>
CK_TILE_HOST void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
constexpr auto one = type_convert<float>(1);
y[0] = x[0] * (one / (one + ck_tile::exp(-x[0])));
y[1] = x[1] * (one / (one + ck_tile::exp(-x[1])));
};
template <>
CK_TILE_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // log2e_v<float> * -1;
// NOTE: x and y must not share registers going into the inline asm
// float tmp0 = x[0], tmp1 = x[1];
asm volatile("v_mul_f32 %[v_y0], %[s_log2e], %[v_x0]\n"
"v_mul_f32 %[v_y1], %[s_log2e], %[v_x1]\n"
"v_exp_f32 %[v_y0], %[v_y0]\n"
"v_exp_f32 %[v_y1], %[v_y1]\n"
"v_add_f32 %[v_y0], %[v_y0], 1.0\n"
"v_add_f32 %[v_y1], %[v_y1], 1.0\n"
"v_rcp_f32 %[v_y0], %[v_y0]\n"
"v_rcp_f32 %[v_y1], %[v_y1]\n"
"v_mul_f32 %[v_y0], %[v_x0], %[v_y0]\n"
"v_mul_f32 %[v_y1], %[v_x1], %[v_y1]\n"
: [v_y0] "+v"(y[0]), [v_y1] "+v"(y[1]), [v_x0] "+v"(x[0]), [v_x1] "+v"(x[1])
: [s_log2e] "s"(log2e_neg_)
:);
};
};
#endif
struct TanH
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......