"docs/vscode:/vscode.git/clone" did not exist on "6bd3851289e2bfc3eb57e9582999415f21e7af52"
Unverified Commit ec959387 authored by rocking, committed by GitHub

Merge branch 'develop' into ck_tile/fmha_receipt_aiter

parents c1e2fef7 0e5e29c4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef UTILITY_DEBUG_HPP
#define UTILITY_DEBUG_HPP
#include "type.hpp"
namespace ck {
namespace debug {
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/type.hpp"
namespace ck {
/**
* @brief Unsigned representation of a conventional biased Float32 exponent.
*
* bias = 127;
*
* E8M0_1 = 0b01111111; => 2^(127-127) = 1
* E8M0_2 = 0b10000000; => 2^(128-127) = 2^1 = 2
* E8M0_3 = 0b10000010; => 2^(130-127) = 2^3 = 8
* E8M0_135 = 0b10000111; => 2^(135-127) = 2^8 = 256
* E8M0_142 = 0b10001110; => 2^(142-127) = 2^15 = 32768
* E8M0_MIN = 0b00000000; => 2^-127
* E8M0_MAX = 0b11111110; => 2^127
* E8M0_NAN = 0b11111111; => NaN
*/
struct e8m0_bexp_t
{
using type = uint8_t;
type data;
constexpr static type bias = 127;
constexpr static type nan_mask = 0xFF;
__host__ __device__ constexpr e8m0_bexp_t() : data{type{}} {}
__host__ __device__ constexpr e8m0_bexp_t(type init) : data{init} {}
__host__ __device__ constexpr e8m0_bexp_t(int init) : data{static_cast<type>(init & nan_mask)}
{
}
__host__ __device__ explicit constexpr e8m0_bexp_t(float scale)
: data{static_cast<type>((bit_cast<uint32_t>(scale) & (nan_mask << 23)) >> 23)}
{
}
__host__ __device__ explicit constexpr operator float() const
{
if(data == nan_mask || data == 0)
{
uint32_t bits = data << 1;
bits |= 1;
bits <<= 22;
return bit_cast<float>(bits);
}
else
{
uint32_t bits = data << 23;
return bit_cast<float>(bits);
}
}
__host__ __device__ constexpr bool operator==(const e8m0_bexp_t& other) const
{
// strict IEEE compliance for NaN
return data == other.data && data != nan_mask;
}
__host__ __device__ constexpr bool is_nan() const { return data == nan_mask; }
};
namespace utils {
template <typename T>
__host__ __device__ inline int get_exponent_value(T x);
template <>
__host__ __device__ inline int get_exponent_value<e8m0_bexp_t>(e8m0_bexp_t x)
{
return x.data;
}
} // namespace utils
} // namespace ck
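For reference, a minimal host-side sketch of the encodings listed in the doc comment above. The header path is an assumption (not shown in the diff); the values follow the table in the comment:

// Hypothetical usage sketch; header location assumed, values from the doc comment.
#include "ck/utility/e8m0.hpp" // assumed header for e8m0_bexp_t
#include <cassert>
int main()
{
using ck::e8m0_bexp_t;
assert(static_cast<float>(e8m0_bexp_t{0b01111111}) == 1.0f);   // 2^(127-127)
assert(static_cast<float>(e8m0_bexp_t{0b10000111}) == 256.0f); // 2^(135-127)
assert(e8m0_bexp_t{0xFF}.is_nan());                            // NaN encoding
assert(!(e8m0_bexp_t{0xFF} == e8m0_bexp_t{0xFF}));             // NaN never compares equal
}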
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
#ifndef CK_CODE_GEN_RTC
template <bool B, typename T = void>
using enable_if = std::enable_if<B, T>;
template <bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type;
#else
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
#endif
} // namespace ck
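As a usage sketch (illustrative names, not part of the diff), the shim lets SFINAE overloads compile both with and without the standard library:

// Illustrative only: ck::enable_if_t resolves to std::enable_if_t on normal
// builds and to the hand-rolled fallback under CK_CODE_GEN_RTC.
template <typename T, ck::enable_if_t<sizeof(T) == 4, bool> = false>
int width_tag(T) { return 4; }
template <typename T, ck::enable_if_t<sizeof(T) == 8, bool> = false>
int width_tag(T) { return 8; }
// width_tag(1.0f) == 4, width_tag(1.0) == 8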
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CODE_GEN_RTC
#pragma once
#include <cstdlib>
@@ -183,3 +184,4 @@ void UpdateEnvVar(EnvVar, const std::string_view& val)
}
} // namespace ck
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -120,11 +120,11 @@ constexpr auto conditional_expr(X&& x, Y&& y)
{
if constexpr(predicate)
{
return ck::forward<X>(x);
}
else
{
return ck::forward<Y>(y);
}
}
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_FUNCTIONAL4_HPP
#define CK_FUNCTIONAL4_HPP
@@ -21,7 +21,7 @@ struct unpack_impl<Sequence<Is...>>
template <typename F, typename X>
__host__ __device__ constexpr auto operator()(F&& f, X&& x) const
{
return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...);
}
};
@@ -35,8 +35,8 @@ struct unpack2_impl<Sequence<Is...>, Sequence<Js...>>
template <typename F, typename X, typename Y>
__host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const
{
return ck::forward<F>(f)(ck::forward<X>(x).At(Number<Is>{})...,
ck::forward<Y>(y).At(Number<Js>{})...);
}
};
@@ -47,7 +47,7 @@ __host__ __device__ constexpr auto unpack(F&& f, X&& x)
{
using X_ = remove_reference_t<X>;
return detail::unpack_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type>{}(
ck::forward<F>(f), ck::forward<X>(x));
}
// TODO: properly implement unpack that takes any number of containers
@@ -58,7 +58,7 @@ __host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y)
using Y_ = remove_reference_t<Y>;
return detail::unpack2_impl<typename arithmetic_sequence_gen<0, X_::Size(), 1>::type,
typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}(
ck::forward<F>(f), ck::forward<X>(x), ck::forward<Y>(y));
}
} // namespace ck
...
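A usage sketch of unpack: it forwards each element of a tuple-like container (anything exposing At(Number<I>{})) into a callable. Names here are illustrative; ck::Tuple is assumed from the surrounding headers:

// Hypothetical sketch (ck::Tuple and ck::Number assumed from tuple.hpp):
__host__ __device__ int unpack_demo()
{
ck::Tuple<int, int, int> t{1, 2, 3};
auto sum3 = [](int a, int b, int c) { return a + b + c; };
// expands to sum3(t.At(Number<0>{}), t.At(Number<1>{}), t.At(Number<2>{}))
return ck::unpack(sum3, t); // 6
}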
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -48,4 +48,9 @@ __host__ __device__ constexpr auto operator%(integral_constant<TX, X>, integral_
return integral_constant<decltype(X % Y), X % Y>{};
}
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type = bool_constant<true>;
using false_type = bool_constant<false>;
} // namespace ck
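A quick illustration of the new aliases (illustrative, not part of the diff):

// The aliases mirror std::bool_constant/std::true_type/std::false_type so
// RTC builds can avoid <type_traits>.
static_assert(ck::bool_constant<(1 + 1 == 2)>::value, "example");
static_assert(ck::true_type::value && !ck::false_type::value, "example");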
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/integral_constant.hpp"
namespace ck {
namespace detail {
template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
struct detector
{
using value_t = integral_constant<bool, false>;
using type = Default;
};
template <class Default, template <class...> class Op, class... Args>
struct detector<Default, ck::void_t<Op<Args...>>, Op, Args...>
{
using value_t = integral_constant<bool, true>;
using type = Op<Args...>;
};
} // namespace detail
@@ -32,12 +34,12 @@ template <template <class...> class Op, class... Args>
using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;
template <typename T>
using is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable);
template <typename T>
using is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable);
template <typename T>
using is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable);
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CODE_GEN_RTC
#include <ostream>
#endif
#pragma once
@@ -25,6 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()
} // namespace ck
#ifndef CK_CODE_GEN_RTC
inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
{
switch(s)
@@ -35,3 +39,4 @@ inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
}
return os;
}
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -9,6 +9,10 @@
#include "type.hpp"
#include "tuple.hpp"
#ifdef CK_CODE_GEN_RTC
#define INT32_MAX 2147483647
#endif
namespace ck {
// magic number division
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -19,7 +19,7 @@ extern "C" __device__ float __ocml_native_recip_f32(float);
#endif
// math functions for the host, some are implemented by calling C++ std functions
#ifndef CK_CODE_GEN_RTC
static inline __host__ float abs(float x) { return std::abs(x); };
static inline __host__ double abs(double x) { return std::abs(x); };
@@ -459,7 +459,7 @@ inline __host__ double expm1<double>(double x)
{
return std::expm1(x);
}
#endif
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
static inline __device__ float abs(float x) { return ::abs(x); };
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/mxfp_utils.hpp"
namespace ck::utils {
template <>
__host__ __device__ inline bool is_nan<f4_t>(e8m0_bexp_t const scale,
f4_t const dataBytes [[maybe_unused]])
{
// no need to check the data as it has no NaN representation; note that
// e8m0_bexp_t::operator== is IEEE-strict and never matches NaN, so test the
// scale with is_nan() rather than comparing against QuietNaN()
return scale.is_nan();
}
// no infinity representation in ocp_e2m1_mxfp4; is_inf will always return false
template <>
__host__ __device__ inline bool is_inf<f4_t>(e8m0_bexp_t const scale [[maybe_unused]],
f4_t const data [[maybe_unused]])
{
// no inf representation for ocp_e2m1_mxfp4
return false;
}
template <>
__host__ __device__ inline bool is_zero<f4_t>(e8m0_bexp_t const scale, f4_t const data)
{
if(is_nan<f4_t>(scale, data))
return false;
// no need to check for scale as it does not have a 0 representation
f4_t result = (data & 0b00001111) & NumericUtils<f4_t>::set_sign_mask;
return result == 0b0;
}
template <>
__host__ __device__ inline float to_float<f4_t>(e8m0_bexp_t const scale, f4_t const data)
{
if(is_nan<f4_t>(scale, data))
return std::numeric_limits<float>::quiet_NaN();
if(is_zero<f4_t>(scale, data))
return 0.0f;
f4_t prepared_data = data & 0b00001111;
int scale_exp = get_exponent_value<e8m0_bexp_t>(scale);
return convert_to_float<f4_t>(prepared_data, scale_exp);
}
template <>
__host__ __device__ inline f4_t sat_convert_to_type<f4_t>(float value)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
{
return sign ? NumericUtils<f4_t>::data_max_negative_normal_mask
: NumericUtils<f4_t>::data_max_positive_normal_mask;
}
if(std::abs(value) > NumericLimits<f4_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f4_t>::data_max_negative_normal_mask
: NumericUtils<f4_t>::data_max_positive_normal_mask;
f4_t res = convert_to_type<f4_t>(value);
if(std::abs(to_float<f4_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f4_t>::DataMinSubnorm())
return value < 0 ? NumericUtils<f4_t>::negative_zero_mask
: NumericUtils<f4_t>::positive_zero_mask;
return res;
}
template <>
__host__ __device__ inline f4_t sat_convert_to_type_sr<f4_t>(float value, uint32_t seed)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
return sign ? NumericUtils<f4_t>::data_max_negative_normal_mask
: NumericUtils<f4_t>::data_max_positive_normal_mask;
if(std::abs(value) > NumericLimits<f4_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f4_t>::data_max_negative_normal_mask
: NumericUtils<f4_t>::data_max_positive_normal_mask;
f4_t res = convert_to_type_sr<f4_t>(value, seed);
if(std::abs(to_float<f4_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f4_t>::DataMinSubnorm())
return value < 0 ? NumericUtils<f4_t>::negative_zero_mask
: NumericUtils<f4_t>::positive_zero_mask;
return res;
}
} // namespace ck::utils
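A worked numeric example of the conversion above, assuming the OCP e2m1 layout for f4_t (1 sign, 2 exponent, 1 mantissa bit, bias 1):

// Hypothetical walk-through for to_float<f4_t> (e2m1 layout assumed):
// data = 0b0110 -> sign 0, exp 0b11 = 3, mant 0 -> (-1)^0 * 2^(3-1) * 1.0 = 4.0
// scale = 0b10000000 -> 2^(128-127) = 2
__host__ void f4_to_float_demo()
{
ck::e8m0_bexp_t scale{0b10000000};
ck::f4_t data = 0b0110;
float v = ck::utils::to_float<ck::f4_t>(scale, data); // expected 8.0f
(void)v;
}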
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/mxfp_utils.hpp"
namespace ck::utils {
/**
* @brief Checks if an f6_t value is NaN based on the provided scale.
*
* For f6_t data, NaN cannot be represented directly. Instead, this function
* determines NaN by checking if the scale is set to a quiet NaN.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param dataBytes The f6_t value to check (unused in this implementation).
* @return true if the scale indicates a NaN value, false otherwise.
*/
template <>
__host__ __device__ inline bool is_nan<f6_t>(e8m0_bexp_t const scale,
f6_t const dataBytes [[maybe_unused]])
{
// no need to check for data as it does not have NaN representation
return scale.is_nan();
}
/**
* @brief Checks if a bf6_t value is NaN based on the provided scale.
*
* For bf6_t data, NaN cannot be represented directly. Instead, this function
* determines NaN by checking if the scale is set to a quiet NaN.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param dataBytes The bf6_t value to check (unused in this implementation).
* @return true if the scale indicates a NaN value, false otherwise.
*/
template <>
__host__ __device__ inline bool is_nan<bf6_t>(e8m0_bexp_t const scale,
bf6_t const dataBytes [[maybe_unused]])
{
// no need to check for data as it does not have NaN representation
return scale.is_nan();
}
/**
* @brief Checks if an f6_t value is infinite.
*
* Because f6_t does not support infinite values, this function always returns false.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to check.
* @return Always false, as infinity is not represented in f6_t.
*/
template <>
__host__ __device__ inline bool is_inf<f6_t>(e8m0_bexp_t const scale [[maybe_unused]],
f6_t const data [[maybe_unused]])
{
// no inf representation for fp6
return false;
}
/**
* @brief Checks if a bf6_t value is infinite.
*
* Because bf6_t does not support infinite values, this function always returns false.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to check.
* @return Always false, as infinity is not represented in bf6_t.
*/
template <>
__host__ __device__ inline bool is_inf<bf6_t>(e8m0_bexp_t const scale [[maybe_unused]],
bf6_t const data [[maybe_unused]])
{
// no inf representation for bf6
return false;
}
/**
* @brief Checks whether an f6_t value is zero.
*
* If the specified f6_t is NaN, this function returns false.
* Otherwise, it masks out the sign bits and checks if the remaining bits
* are zero.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to check.
* @return true if the value is zero; otherwise false.
*/
template <>
__host__ __device__ inline bool is_zero<f6_t>(e8m0_bexp_t const scale, f6_t const data)
{
if(is_nan<f6_t>(scale, data))
return false;
// no need to check for scale as it does not have a 0 representation
f6_t result = (data & 0b00111111) & NumericUtils<f6_t>::set_sign_mask;
return result == 0b0;
}
/**
* @brief Checks whether a bf6_t value is zero.
*
* If the specified bf6_t is NaN, this function returns false.
* Otherwise, it masks out the sign bits and checks if the remaining bits
* are zero.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to check.
* @return true if the value is zero; otherwise false.
*/
template <>
__host__ __device__ inline bool is_zero<bf6_t>(e8m0_bexp_t const scale, bf6_t const data)
{
if(is_nan<bf6_t>(scale, data))
return false;
// no need to check for scale as it does not have a 0 representation
bf6_t result = (data & 0b00111111) & NumericUtils<bf6_t>::set_sign_mask;
return result == 0b0;
}
/**
* @brief Converts an f6_t value to a float based on an e8m0_bexp_t scale factor.
*
* Checks if the f6_t value is NaN or zero before performing the conversion.
* Applies the exponent from the scale to compute the final float result.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to convert.
* @return The converted float value.
*/
template <>
__host__ __device__ inline float to_float<f6_t>(e8m0_bexp_t const scale, f6_t const data)
{
if(is_nan<f6_t>(scale, data))
return std::numeric_limits<float>::quiet_NaN();
if(is_zero<f6_t>(scale, data))
return 0.0f;
f6_t prepared_data = data & 0b00111111;
int scale_exp = get_exponent_value<e8m0_bexp_t>(scale);
return convert_to_float<f6_t>(prepared_data, scale_exp);
}
/**
* @brief Converts a bf6_t value to a float based on an e8m0_bexp_t scale factor.
*
* Checks if the bf6_t value is NaN or zero before performing the conversion.
* Applies the exponent from the scale to compute the final float result.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to convert.
* @return The converted float value.
*/
template <>
__host__ __device__ inline float to_float<bf6_t>(e8m0_bexp_t const scale, bf6_t const data)
{
if(is_nan<bf6_t>(scale, data))
return std::numeric_limits<float>::quiet_NaN();
if(is_zero<bf6_t>(scale, data))
return 0.0f;
bf6_t prepared_data = data & 0b00111111;
int scale_exp = get_exponent_value<e8m0_bexp_t>(scale);
return convert_to_float<bf6_t>(prepared_data, scale_exp);
}
/**
* @brief Converts a float to f6_t with saturation.
*
* If the input is NaN or exceeds the representable range for f6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated f6_t value.
*/
template <>
__host__ __device__ inline f6_t sat_convert_to_type<f6_t>(float value)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
{
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
}
if(std::abs(value) > NumericLimits<f6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
f6_t res = convert_to_type<f6_t>(value);
if(std::abs(to_float<f6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f6_t>::DataMinSubnorm())
return sign ? NumericUtils<f6_t>::negative_zero_mask
: NumericUtils<f6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to bf6_t with saturation.
*
* If the input is NaN or exceeds the representable range for bf6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated bf6_t value.
*/
template <>
__host__ __device__ inline bf6_t sat_convert_to_type<bf6_t>(float value)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
{
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
}
if(std::abs(value) > NumericLimits<bf6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
bf6_t res = convert_to_type<bf6_t>(value);
if(std::abs(to_float<bf6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<bf6_t>::DataMinSubnorm())
return sign ? NumericUtils<bf6_t>::negative_zero_mask
: NumericUtils<bf6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to f6_t with saturation and stochastic rounding.
*
* If the input is NaN or exceeds the representable range for f6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @param seed Seed for the stochastic rounding decision.
* @return The saturated f6_t value.
*/
template <>
__host__ __device__ inline f6_t sat_convert_to_type_sr<f6_t>(float value, uint32_t seed)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
if(std::abs(value) > NumericLimits<f6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
f6_t res = convert_to_type_sr<f6_t>(value, seed);
if(std::abs(to_float<f6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f6_t>::DataMinSubnorm())
return sign ? NumericUtils<f6_t>::negative_zero_mask
: NumericUtils<f6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to bf6_t with saturation and stochastic rounding.
*
* If the input is NaN or exceeds the representable range for bf6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @param seed Seed for the stochastic rounding decision.
* @return The saturated bf6_t value.
*/
template <>
__host__ __device__ inline bf6_t sat_convert_to_type_sr<bf6_t>(float value, uint32_t seed)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
if(std::abs(value) > NumericLimits<bf6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
bf6_t res = convert_to_type_sr<bf6_t>(value, seed);
if(std::abs(to_float<bf6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<bf6_t>::DataMinSubnorm())
return sign ? NumericUtils<bf6_t>::negative_zero_mask
: NumericUtils<bf6_t>::positive_zero_mask;
return res;
}
} // namespace ck::utils
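A short usage sketch of the saturating converters defined above (function name is illustrative):

// Hypothetical usage sketch: saturating float -> f6_t conversion.
// Out-of-range and NaN inputs clamp to the signed max-normal encoding.
__host__ void f6_sat_demo()
{
ck::f6_t a = ck::utils::sat_convert_to_type<ck::f6_t>(1e9f);  // +max normal mask
ck::f6_t b = ck::utils::sat_convert_to_type<ck::f6_t>(-1e9f); // -max normal mask
(void)a;
(void)b;
}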
#include "ck/utility/data_type.hpp"
#include "ck/utility/mxfp_utils.hpp"
#if defined(__gfx950__) && __HIP_DEVICE_COMPILE__
#define CK_MX_FP8_CVT_FAST_PATH 1
#else
#define CK_MX_FP8_CVT_FAST_PATH 0
#endif
namespace ck {
namespace fp8_impl {
#if CK_MX_FP8_CVT_FAST_PATH
template <ck_fp8_interpretation_t interpret>
static __device__ float cast_to_f32_from_f8_scaled(float scale, fp8_storage_t v)
{
union
{
unsigned int i32val;
unsigned char i8val[4];
} val;
val.i8val[0] = v;
static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
"Only OCP interpretations are supported");
if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
return __builtin_amdgcn_cvt_scalef32_f32_fp8(val.i32val, scale, 0);
}
else
{
return __builtin_amdgcn_cvt_scalef32_f32_bf8(val.i32val, scale, 0);
}
}
template <ck_fp8_interpretation_t interpret>
static __device__ float2_t cast_to_f32x2_from_f8x2_scaled(float scale, fp8x2_storage_t v)
{
const auto i16val = bit_cast<uint16_t>(v);
static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
"Only OCP interpretations are supported");
if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(i16val, scale, 0);
}
else
{
return __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(i16val, scale, 0);
}
}
template <ck_fp8_interpretation_t interpret, bool stochastic_rounding = false>
static __device__ fp8_storage_t cast_to_f8_from_f32_scaled(float v,
unsigned int rng = 0,
float scale = 1.0f)
{
fp8_storage_t i8data;
union
{
float fval;
unsigned int i32val;
} val;
union
{
uint32_t ival;
vector_type<int16_t, 2>::type v2i16;
fp8_storage_t v4i8[4];
} ret{};
// unsigned int ival = 0;
val.fval = v;
if constexpr(stochastic_rounding)
{
ret.ival =
(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
? __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(ret.ival, val.fval, rng, scale, 0)
: __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(ret.ival, val.fval, rng, scale, 0);
i8data = ret.v4i8[0];
}
else
{
// RNE CVT
// llvm.amdgcn.cvt.scalef32.pk.fp8.f32
// v2i16 old_vdst, float srcA, float srcB, float scale, bool dst_lo_hi_sel
if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
// If fval / scale > max fp8, returns Nan
ret.v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(/*old_vdst*/ ret.v2i16,
val.fval,
val.fval,
scale,
/*dst_lo_hi_sel*/ false);
}
else
{
// If fval / scale > max bf8, returns Inf
ret.v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(/*old_vdst*/ ret.v2i16,
val.fval,
val.fval,
scale,
/*dst_lo_hi_sel*/ false);
}
i8data = ret.v4i8[0];
}
return i8data;
}
template <ck_fp8_interpretation_t interpret, bool stochastic_rounding = false>
static __device__ fp8x2_storage_t cast_to_f8_from_f32_scaled(float2_t v,
unsigned int rng = 0,
float scale = 1.0f)
{
union
{
uint32_t ival;
vector_type<int16_t, 2>::type v2i16;
StaticallyIndexedArray<fp8x2_storage_t, 2> v2f8x2;
} ret{};
if constexpr(stochastic_rounding)
{
fp8x2_storage_t f8x2;
if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
ret.ival = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(ret.ival, v[0], rng, scale, 0);
f8x2[0] = ret.v2f8x2(Number<0>{})[0];
ret.ival = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(ret.ival, v[1], rng, scale, 0);
f8x2[1] = ret.v2f8x2(Number<0>{})[0];
}
else
{
ret.ival = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(ret.ival, v[0], rng, scale, 0);
f8x2[0] = ret.v2f8x2(Number<0>{})[0];
ret.ival = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(ret.ival, v[1], rng, scale, 0);
f8x2[1] = ret.v2f8x2(Number<0>{})[0];
}
return f8x2;
}
else
{
// RNE CVT
// llvm.amdgcn.cvt.scalef32.pk.fp8.f32
// v2i16 old_vdst, float srcA, float srcB, float scale, bool dst_lo_hi_sel
if constexpr(interpret == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
// If fval / scale > max fp8, returns Nan
ret.v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(/*old_vdst*/ ret.v2i16,
v[0],
v[1],
scale,
/*dst_lo_hi_sel*/ false);
}
else
{
// If fval / scale > max bf8, returns Inf
ret.v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(/*old_vdst*/ ret.v2i16,
v[0],
v[1],
scale,
/*dst_lo_hi_sel*/ false);
}
return ret.v2f8x2(Number<0>{});
}
}
#endif // CK_MX_FP8_CVT_FAST_PATH
#if CK_MX_FP8_CVT_FAST_PATH
/**
* \brief convert float to @p fp8_storage_t with scaling
*
* This version is used when the fast path (MX FP8 hardware) is available
*
* \tparam interp interpretation of fp8
* \param f float number
* \param scale scaling factor
* \return fp8_storage_t
*/
template <ck_fp8_interpretation_t interp, bool stochastic_rounding = false>
__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8_scaled(const float f, float scale)
{
__is_interpret_supported(interp);
uint32_t rng = 0;
if constexpr(stochastic_rounding)
{
constexpr int seed = 1254739;
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
}
return cast_to_f8_from_f32_scaled<interp, stochastic_rounding>(f, rng, scale);
}
/**
* \brief convert 2xfloat to @p 2xfp8_storage_t with scaling
*
* This version is used when the fast path (MX FP8 hardware) is available
*
* \tparam interp interpretation of fp8
* \param f 2xfloat
* \param scale scaling factor
* \return 2xfp8_storage_t
*/
template <ck_fp8_interpretation_t interp, bool stochastic_rounding = false>
__host__ __device__ static inline fp8x2_storage_t cvt_float_to_fp8_scaled(const float2_t f,
float scale)
{
__is_interpret_supported(interp);
uint32_t rng = 0;
if constexpr(stochastic_rounding)
{
constexpr int seed = 1254739;
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
}
return cast_to_f8_from_f32_scaled<interp, stochastic_rounding>(f, rng, scale);
}
#else
/**
* \brief convert float to @p fp8_storage_t with scaling
*
* This version is used when the fast path (MX FP8 hardware) is not available
*
* \tparam interp interpretation of fp8
* \param f float number
* \param scale scaling factor
* \return fp8_storage_t
*/
template <ck_fp8_interpretation_t interp, bool stochastic_rounding = false>
__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8_scaled(const float f, float scale)
{
static_assert(interp == ck_fp8_interpretation_t::CK_E4M3_OCP ||
interp == ck_fp8_interpretation_t::CK_E5M2_OCP,
"Only OCP interpretations are supported");
uint32_t rng = 0;
if constexpr(stochastic_rounding)
{
constexpr int seed = 1254739;
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
}
if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
return cast_to_f8<float, 3, 4, false, true, stochastic_rounding>(f / scale, rng);
}
else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP)
{
return cast_to_f8<float, 2, 5, false, true, stochastic_rounding>(f / scale, rng);
}
else
{
__hip_assert(false && "FP8 type is not supported by current target device");
return 0;
}
}
/**
* \brief convert two floats to @p 2xfp8_storage_t with scaling
*
* This version is used when the fast path (MX FP8 hardware) is not available
*
* \tparam interp interpretation of fp8
* \param f 2xfloat
* \param scale scaling factor
* \return 2xfp8_storage_t
*/
template <ck_fp8_interpretation_t interp, bool stochastic_rounding = false>
__host__ __device__ static inline fp8x2_storage_t cvt_float_to_fp8_scaled(const float2_t f,
float scale)
{
static_assert(interp == ck_fp8_interpretation_t::CK_E4M3_OCP ||
interp == ck_fp8_interpretation_t::CK_E5M2_OCP,
"Only OCP interpretations are supported");
uint32_t rng = 0;
if constexpr(stochastic_rounding)
{
constexpr int seed = 1254739;
rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
}
if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP)
{
return {cast_to_f8<float, 3, 4, false, true, stochastic_rounding>(f[0] / scale, rng),
cast_to_f8<float, 3, 4, false, true, stochastic_rounding>(f[1] / scale, rng)};
}
else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP)
{
return {cast_to_f8<float, 2, 5, false, true, stochastic_rounding>(f[0] / scale, rng),
cast_to_f8<float, 2, 5, false, true, stochastic_rounding>(f[1] / scale, rng)};
}
else
{
__hip_assert(false && "FP8 type is not supported by current target device");
return 0;
}
}
#endif // CK_MX_FP8_CVT_FAST_PATH
} // namespace fp8_impl
// Declare a template function for fp8 conversion using SR
template <typename Y, typename X>
__host__ __device__ constexpr Y mxf8_convert_sr(X x, float scale);
// Declare a template function for fp8 conversion using RNE
template <typename Y, typename X>
__host__ __device__ constexpr Y mxf8_convert_rne(X x, float scale);
// convert fp32 to fp8 with rounding to nearest even
template <>
inline __host__ __device__ f8_ocp_t mxf8_convert_rne<f8_ocp_t, float>(float x, float scale)
{
return f8_ocp_t{fp8_impl::cvt_float_to_fp8_scaled<f8_ocp_t::default_interpret>(x, scale)};
}
// convert fp32 to bf8 with rounding to nearest even
template <>
inline __host__ __device__ bf8_ocp_t mxf8_convert_rne<bf8_ocp_t, float>(float x, float scale)
{
return bf8_ocp_t{fp8_impl::cvt_float_to_fp8_scaled<bf8_ocp_t::default_interpret>(x, scale)};
}
// convert fp32x2 to fp8x2 with rounding to nearest even
template <>
inline __host__ __device__ f8x2_ocp_t mxf8_convert_rne<f8x2_ocp_t, float2_t>(float2_t x,
float scale)
{
return f8x2_ocp_t{fp8_impl::cvt_float_to_fp8_scaled<f8_ocp_t::default_interpret>(x, scale)};
}
// convert fp32x2 to bf8x2 with rounding to nearest even
template <>
inline __host__ __device__ bf8x2_ocp_t mxf8_convert_rne<bf8x2_ocp_t, float2_t>(float2_t x,
float scale)
{
return bf8x2_ocp_t{fp8_impl::cvt_float_to_fp8_scaled<bf8_ocp_t::default_interpret>(x, scale)};
}
// convert fp32x16 to fp8x16 with rounding to nearest even
template <>
inline __host__ __device__ f8x16_ocp_t mxf8_convert_rne<f8x16_ocp_t, float16_t>(float16_t x,
float scale)
{
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} in{x};
union
{
f8x16_ocp_t fp8_1x16;
f8x2_ocp_t fp8_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}(
[&](auto i) { out.fp8_2x8[i] = mxf8_convert_rne<f8x2_ocp_t>(in.float_2x8[i], scale); });
return out.fp8_1x16;
}
// convert fp32x16 to bf8x16 with rounding to nearest even
template <>
inline __host__ __device__ bf8x16_ocp_t mxf8_convert_rne<bf8x16_ocp_t, float16_t>(float16_t x,
float scale)
{
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} in{x};
union
{
bf8x16_ocp_t bf8_1x16;
bf8x2_ocp_t bf8_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}(
[&](auto i) { out.bf8_2x8[i] = mxf8_convert_rne<bf8x2_ocp_t>(in.float_2x8[i], scale); });
return out.bf8_1x16;
}
// convert fp32x32 to fp8x32 with rounding to nearest even
template <>
inline __host__ __device__ f8x32_ocp_t mxf8_convert_rne<f8x32_ocp_t, float32_t>(float32_t x,
float scale)
{
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} in{x};
union
{
f8x32_ocp_t fp8_1x32;
f8x16_ocp_t fp8_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}(
[&](auto i) { out.fp8_16x2[i] = mxf8_convert_rne<f8x16_ocp_t>(in.float_16x2[i], scale); });
return out.fp8_1x32;
}
// convert fp32x32 to bf8x32 with rounding to nearest even
template <>
inline __host__ __device__ bf8x32_ocp_t mxf8_convert_rne<bf8x32_ocp_t, float32_t>(float32_t x,
float scale)
{
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} in{x};
union
{
bf8x32_ocp_t bf8_1x32;
bf8x16_ocp_t bf8_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}(
[&](auto i) { out.bf8_16x2[i] = mxf8_convert_rne<bf8x16_ocp_t>(in.float_16x2[i], scale); });
return out.bf8_1x32;
}
// convert fp32 to fp8 with stochastic rounding
template <>
inline __host__ __device__ f8_ocp_t mxf8_convert_sr<f8_ocp_t, float>(float x, float scale)
{
return f8_ocp_t{fp8_impl::cvt_float_to_fp8_scaled<f8_ocp_t::default_interpret, true>(x, scale)};
}
// convert fp32 to bf8 with stochastic rounding
template <>
inline __host__ __device__ bf8_ocp_t mxf8_convert_sr<bf8_ocp_t, float>(float x, float scale)
{
return bf8_ocp_t{
fp8_impl::cvt_float_to_fp8_scaled<bf8_ocp_t::default_interpret, true>(x, scale)};
}
// convert fp32x2 to fp8x2 with stochastic rounding
template <>
inline __host__ __device__ f8x2_ocp_t mxf8_convert_sr<f8x2_ocp_t, float2_t>(float2_t x, float scale)
{
return f8x2_ocp_t{
fp8_impl::cvt_float_to_fp8_scaled<f8_ocp_t::default_interpret, true>(x, scale)};
}
// convert fp32x2 to bf8x2 with stochastic rounding
template <>
inline __host__ __device__ bf8x2_ocp_t mxf8_convert_sr<bf8x2_ocp_t, float2_t>(float2_t x,
float scale)
{
return bf8x2_ocp_t{
fp8_impl::cvt_float_to_fp8_scaled<bf8_ocp_t::default_interpret, true>(x, scale)};
}
// convert fp32x16 to fp8x16 with stochastic rounding
template <>
inline __host__ __device__ f8x16_ocp_t mxf8_convert_sr<f8x16_ocp_t, float16_t>(float16_t x,
float scale)
{
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} in{x};
union
{
f8x16_ocp_t fp8_1x16;
f8x2_ocp_t fp8_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}(
[&](auto i) { out.fp8_2x8[i] = mxf8_convert_sr<f8x2_ocp_t>(in.float_2x8[i], scale); });
return out.fp8_1x16;
}
// convert fp32x16 to bf8x16 with stochastic rounding
template <>
inline __host__ __device__ bf8x16_ocp_t mxf8_convert_sr<bf8x16_ocp_t, float16_t>(float16_t x,
float scale)
{
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} in{x};
union
{
bf8x16_ocp_t bf8_1x16;
bf8x2_ocp_t bf8_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}(
[&](auto i) { out.bf8_2x8[i] = mxf8_convert_sr<bf8x2_ocp_t>(in.float_2x8[i], scale); });
return out.bf8_1x16;
}
// convert fp32x32 to fp8x32 with stochastic rounding
template <>
inline __host__ __device__ f8x32_ocp_t mxf8_convert_sr<f8x32_ocp_t, float32_t>(float32_t x,
float scale)
{
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} in{x};
union
{
f8x32_ocp_t fp8_1x32;
f8x16_ocp_t fp8_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}(
[&](auto i) { out.fp8_16x2[i] = mxf8_convert_sr<f8x16_ocp_t>(in.float_16x2[i], scale); });
return out.fp8_1x32;
}
// convert fp32x32 to bf8x32 with stochastic rounding
template <>
inline __host__ __device__ bf8x32_ocp_t mxf8_convert_sr<bf8x32_ocp_t, float32_t>(float32_t x,
float scale)
{
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} in{x};
union
{
bf8x32_ocp_t bf8_1x32;
bf8x16_ocp_t bf8_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}(
[&](auto i) { out.bf8_16x2[i] = mxf8_convert_sr<bf8x16_ocp_t>(in.float_16x2[i], scale); });
return out.bf8_1x32;
}
} // namespace ck
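A usage sketch of the conversions declared above (demo function name is illustrative):

// Hypothetical sketch: quantize a float to MX-scaled OCP fp8 with RNE, then SR.
__host__ __device__ void mxf8_demo()
{
float x = 3.14f;
float scale = 2.0f; // decoded e8m0 scale
ck::f8_ocp_t q_rne = ck::mxf8_convert_rne<ck::f8_ocp_t>(x, scale); // round-to-nearest-even
ck::f8_ocp_t q_sr  = ck::mxf8_convert_sr<ck::f8_ocp_t>(x, scale);  // stochastic rounding
// both encode approximately x / scale in e4m3; SR randomizes the last ULP
(void)q_rne;
(void)q_sr;
}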
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck::utils {
union cvt
{
float value_float;
uint32_t value_bitwise;
};
template <typename DTYPE>
inline bool getDataHasInf()
{
return DTYPE::dataInfo.hasInf;
}
template <typename T>
__host__ __device__ inline bool is_zero(e8m0_bexp_t const scale, T const data);
template <typename T>
__host__ __device__ inline bool is_nan(e8m0_bexp_t const scale, T const data);
template <typename T>
__host__ __device__ inline bool is_inf(e8m0_bexp_t const scale, T const data);
template <typename T>
__host__ __device__ inline int get_exponent_value(T x)
{
x >>= NumericUtils<T>::mant;
x &= ((1 << NumericUtils<T>::exp) - 1);
return static_cast<int>(x);
}
template <typename T>
__host__ __device__ inline bool is_subnormal(T x)
{
return get_exponent_value<T>(x) == 0;
}
template <typename T>
__host__ __device__ inline double get_mantissa_value(T x)
{
double mantissa = is_subnormal<T>(x) ? 0.0f : 1.0f;
for(uint i = 0; i < NumericUtils<T>::mant; i++)
{
mantissa += std::pow(2, -int32_t((NumericUtils<T>::mant - i))) * (x & 0b1);
x >>= 1;
}
return mantissa;
}
template <typename T>
__host__ __device__ inline bool get_data_has_inf()
{
return NumericUtils<T>::has_inf;
}
template <typename T>
__host__ __device__ float convert_to_float(T data, int scale_exp)
{
float d_sign =
std::pow(-1, static_cast<float>(data >> (NumericUtils<T>::exp + NumericUtils<T>::mant)));
float d_exp;
if(is_subnormal<T>(data))
d_exp = std::pow(2, 1 - static_cast<int>(NumericUtils<T>::bias));
else
d_exp = std::pow(2, get_exponent_value<T>(data) - static_cast<int>(NumericUtils<T>::bias));
float d_mant = get_mantissa_value<T>(data);
float data_value = d_sign * d_exp * d_mant;
float scale_value = std::pow(
2, static_cast<float>((scale_exp - static_cast<int>(NumericUtils<e8m0_bexp_t>::bias))));
return data_value * scale_value;
}
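In symbols, the computation above amounts to the following, kept here as a comment sketch:

// to_float/convert_to_float compute
//   value = (-1)^s * 2^(e - bias_T) * m * 2^(scale_exp - 127)
// where e and m are the exponent and mantissa fields of the data; for a
// subnormal input (e == 0) the factor 2^(e - bias_T) becomes 2^(1 - bias_T)
// and m carries no implicit leading 1.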
template <typename T>
__host__ __device__ inline float to_float(e8m0_bexp_t const scale, T const data);
template <typename T>
__host__ __device__ T sat_convert_to_type(float value);
template <typename T>
__host__ __device__ T sat_convert_to_type_sr(float value, uint32_t seed);
template <typename T>
inline T convert_to_type(float value)
{
using bitwise_type = typename NumericUtils<T>::bitwise_type;
if(std::abs(value) > NumericLimits<T>::Max())
{
float max_value = NumericLimits<T>::Max();
cvt t;
// cppcheck-suppress redundantAssignment
t.value_float = max_value;
uint32_t max_bitwise = t.value_bitwise;
// cppcheck-suppress redundantAssignment
t.value_float = value;
bitwise_type sign =
t.value_bitwise >> (NumericUtils<float>::exp + NumericUtils<float>::mant);
bitwise_type exp =
((max_bitwise >> NumericUtils<float>::mant) & NumericUtils<float>::exp_mask) -
(NumericUtils<float>::bias - NumericUtils<T>::bias);
bitwise_type mantissa = max_bitwise >> (NumericUtils<float>::mant - NumericUtils<T>::mant);
uint32_t mant_prev = max_bitwise >> (NumericUtils<float>::mant - NumericUtils<T>::mant);
mant_prev &= ((1 << NumericUtils<T>::mant) - 1);
mant_prev--;
mant_prev <<= (NumericUtils<float>::mant - NumericUtils<T>::mant);
uint32_t prev_bit =
((max_bitwise >> NumericUtils<float>::mant) << NumericUtils<float>::mant) | mant_prev;
t.value_bitwise = prev_bit;
float prev_val = t.value_float;
float diff = max_value - prev_val;
float actual_max = max_value + (diff / 2);
if(std::abs(value) < actual_max)
{
return sign << ((NumericUtils<T>::exp + NumericUtils<T>::mant)) |
(exp << NumericUtils<T>::mant) | mantissa;
}
else
{
if(!get_data_has_inf<T>())
{
return (1 << (NumericUtils<T>::mant + NumericUtils<T>::exp)) - 1;
}
else
{
exp++;
return sign << ((NumericUtils<T>::exp + NumericUtils<T>::mant)) |
(exp << NumericUtils<T>::mant);
}
}
}
const int mfmt = NumericUtils<float>::mant;
uint32_t x;
x = bit_cast<uint32_t>(value);
uint32_t head, mantissa;
int32_t exponent, bias;
uint32_t sign;
head = x & NumericUtils<float>::head_mask;
mantissa = x & NumericUtils<float>::mant_mask;
exponent = (head >> NumericUtils<float>::mant) & NumericUtils<float>::exp_mask;
sign = head >> (NumericUtils<float>::mant + NumericUtils<float>::exp);
bias = NumericUtils<float>::bias;
if(x == 0)
{
return 0b0;
}
const int mini_bias = NumericUtils<T>::bias;
const int mini_denormal_act_exponent = 1 - mini_bias;
int act_exponent, out_exponent, exponent_diff;
bool is_subnorm = false;
if(exponent == 0)
{
act_exponent = exponent - bias + 1;
exponent_diff = mini_denormal_act_exponent - act_exponent;
is_subnorm = true;
}
else
{
act_exponent = exponent - bias;
if(act_exponent <= mini_denormal_act_exponent)
{
exponent_diff = mini_denormal_act_exponent - act_exponent;
is_subnorm = true;
}
else
{
exponent_diff = 0;
}
mantissa += (1UL << mfmt);
}
auto shift_amount = (mfmt - NumericUtils<T>::mant + exponent_diff);
shift_amount = (shift_amount >= 64) ? 63 : shift_amount;
bool midpoint = (mantissa & ((1UL << shift_amount) - 1)) == (1UL << (shift_amount - 1));
float min_subnorm = NumericLimits<T>::DataMinSubnorm() * (sign ? -1 : 1);
if(is_subnorm && std::abs(value) < std::abs(min_subnorm))
{
// closer to 0
if(std::abs(value) <= std::abs(min_subnorm - value))
return 0;
else
return 1 | (sign << (NumericUtils<T>::exp + NumericUtils<T>::mant));
}
if(exponent_diff > 0)
mantissa >>= exponent_diff;
else if(exponent_diff == -1)
mantissa <<= -exponent_diff;
bool implicit_one = mantissa & (1 << mfmt);
out_exponent = (act_exponent + exponent_diff) + mini_bias - (implicit_one ? 0 : 1);
uint32_t drop_mask = (1UL << (mfmt - NumericUtils<T>::mant)) - 1;
bool odd = mantissa & (1UL << (mfmt - NumericUtils<T>::mant));
mantissa += (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa) & drop_mask;
if(out_exponent == 0)
{
if((1UL << mfmt) & mantissa)
{
out_exponent = 1;
}
}
else
{
if((1UL << (mfmt + 1)) & mantissa)
{
mantissa >>= 1;
out_exponent++;
}
}
mantissa >>= (mfmt - NumericUtils<T>::mant);
if(out_exponent == 0 && mantissa == 0)
{
return 0;
}
mantissa &= (1UL << NumericUtils<T>::mant) - 1;
return (sign << (NumericUtils<T>::exp + NumericUtils<T>::mant)) |
(out_exponent << NumericUtils<T>::mant) | mantissa;
}
template <typename T>
inline T convert_to_type_sr(float value, uint32_t seed)
{
if(std::abs(value) > NumericLimits<T>::Max())
{
float max_value = NumericLimits<T>::Max();
cvt t;
// cppcheck-suppress redundantAssignment
t.value_float = max_value;
uint max_bitwise = t.value_bitwise;
// cppcheck-suppress redundantAssignment
t.value_float = value;
T sign = t.value_bitwise >> (NumericUtils<float>::exp + NumericUtils<float>::mant);
T exp = ((max_bitwise >> NumericUtils<float>::mant) & NumericUtils<float>::exp_mask) -
(NumericUtils<float>::bias - NumericUtils<T>::bias);
uint32_t mant_prev = max_bitwise >> (NumericUtils<float>::mant - NumericUtils<T>::mant);
mant_prev &= ((1UL << NumericUtils<T>::mant) - 1);
mant_prev--;
mant_prev <<= (NumericUtils<float>::mant - NumericUtils<T>::mant);
uint32_t prev_bit =
((max_bitwise >> NumericUtils<float>::mant) << NumericUtils<float>::mant) | mant_prev;
t.value_bitwise = prev_bit;
float prev_val = t.value_float;
float diff = max_value - prev_val;
float actual_max = max_value + (diff / 2);
if(std::abs(value) < actual_max)
{
double d_max_value = static_cast<double>(max_value);
double d_actual_max = static_cast<double>(actual_max);
double d_value = static_cast<double>(value);
double d_is = std::abs(d_max_value - d_actual_max);
double d_seed = static_cast<double>(seed);
double d_prob = 1.0f - (std::abs(d_value - d_max_value) / d_is); // prob to round down
double thresh = UINT_MAX * d_prob;
if(!get_data_has_inf<T>() || d_seed <= thresh)
// return static_cast<T>(satConvertToType(getDataMax<DTYPE>())); //round down time
return sign == 0 ? NumericUtils<T>::data_max_positive_normal_mask
: NumericUtils<T>::data_max_negative_normal_mask;
else
{
exp++;
return sign << ((NumericUtils<T>::exp + NumericUtils<T>::mant)) // inf
| (exp << NumericUtils<T>::mant);
}
}
else
{
if(!get_data_has_inf<T>())
return (1 << (NumericUtils<T>::mant + NumericUtils<T>::exp)) - 1;
else
{
exp++;
return sign << ((NumericUtils<T>::exp + NumericUtils<T>::mant)) // inf
| (exp << NumericUtils<T>::mant);
}
}
}
uint32_t f32 = bit_cast<uint32_t>(value);
auto f32_mant = f32 & NumericUtils<float>::mant_mask;
auto head = f32 & NumericUtils<float>::head_mask;
auto f32_exp = (head >> NumericUtils<float>::mant) & NumericUtils<float>::exp_mask;
auto sign_bit = head >> (NumericUtils<float>::mant + NumericUtils<float>::exp);
auto sign = sign_bit << (NumericUtils<T>::exp + NumericUtils<T>::mant);
f32_exp = static_cast<int32_t>(f32_exp) - NumericUtils<float>::bias;
int32_t exp = f32_exp;
auto mant = f32_mant;
bool subnorm = false;
if(f32 == 0)
return 0b0;
if(exp >= NumericUtils<T>::unbiased_exp_min)
{
mant = f32_mant;
}
// if T's exponent width matches f32's (8 bits), its subnormals coincide with
// f32's and need no adjustment
else if(exp < NumericUtils<T>::unbiased_exp_min &&
NumericUtils<T>::exp < NumericUtils<float>::exp)
{
subnorm = true;
auto diff = static_cast<uint32_t>(NumericUtils<T>::unbiased_exp_min - exp);
if(diff >= 32)
{
mant = 0;
f32_mant = 0;
}
else
{
f32_mant |= static_cast<uint32_t>(1) << NumericUtils<float>::mant;
f32_mant >>= diff;
}
exp = 0;
mant = f32_mant;
}
uint32_t sr_shift = NumericUtils<T>::sr_shift;
// For stochastic-rounding we add the aligned random value to the
// mantissa and then truncate (RTZ).
mant += seed >> sr_shift;
// Increment exponent when mantissa overflows due to rounding
if(mant >= static_cast<uint32_t>(1) << NumericUtils<float>::mant)
++exp;
mant >>= (NumericUtils<float>::mant - NumericUtils<T>::mant);
mant &= ((1 << NumericUtils<T>::mant) - 1);
auto biased_exp = static_cast<uint32_t>(exp);
if(!subnorm)
biased_exp = static_cast<uint32_t>(exp + NumericUtils<T>::bias);
biased_exp &= ((1 << NumericUtils<T>::exp) - 1);
auto val = sign | biased_exp << NumericUtils<T>::mant | mant;
return val;
}
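A note on the rounding step above, as a comment sketch:

// The f32 mantissa keeps 23 bits while the target keeps NumericUtils<T>::mant,
// so the low bits are discarded. Adding (seed >> sr_shift) before truncation
// rounds up with probability proportional to the discarded fraction, giving
// unbiased stochastic rounding on average.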
} // namespace ck::utils
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <ck/utility/ignore.hpp>
#include "ck/ck.hpp"
#ifdef CK_CODE_GEN_RTC
using uint8_t = unsigned char;
using uint16_t = unsigned short;
using uint32_t = unsigned int;
#endif
namespace ck {
// Pseudo random number generator
// version for fp32
template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<float, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{
uint32_t x = *(reinterpret_cast<uint32_t*>(&val));
@@ -25,7 +30,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
}
// version for fp16
template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
__host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
{
uint16_t x = *(reinterpret_cast<uint16_t*>(&val));
@@ -40,15 +45,14 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
}
// return 0 if data is not fp16 or fp32
template <typename T,
uint32_t seed_t,
ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
__host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
{
ck::ignore = id;
ck::ignore = val;
ck::ignore = seed;
return 0;
}
...
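A usage sketch of the generator (demo function name is illustrative; the seed constant is the one the fp8 converters above use):

// Hypothetical sketch: derive a per-element random word for stochastic rounding.
__host__ uint32_t sr_seed_demo()
{
float val = 1.5f;
ck::index_t flat_id = 42; // element index within the tensor
return ck::prand_generator<float, 1254739>(flat_id, val);
}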
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/type_convert.hpp"
#include "ck/utility/mxf8_utils.hpp"
#ifdef CK_USE_NATIVE_MX_SUPPORT
#define CK_USE_NATIVE_MX_SUPPORT 1
#else
#define CK_USE_NATIVE_MX_SUPPORT 0
#endif
namespace ck {
// Declare a template function for scaled conversion
template <typename Y, typename X>
#if CK_USE_OCP_FP8
__host__ __device__ constexpr Y scaled_type_convert(e8m0_bexp_t scale, X x);
#else
__host__ constexpr Y scaled_type_convert(e8m0_bexp_t scale, X x);
#endif
// convert f8_ocp_t to fp32
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float scaled_type_convert<float, f8_ocp_t>(e8m0_bexp_t scale, f8_ocp_t x)
#else
inline __host__ float scaled_type_convert<float, f8_ocp_t>(e8m0_bexp_t scale, f8_ocp_t x)
#endif
{
#if CK_MX_FP8_CVT_FAST_PATH
return fp8_impl::cast_to_f32_from_f8_scaled<f8_ocp_t::default_interpret>(
type_convert<float>(scale), x.data);
#else
return type_convert<float>(scale) * type_convert<float>(x);
#endif
}
// convert bf8_ocp_t to fp32
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float scaled_type_convert<float, bf8_ocp_t>(e8m0_bexp_t scale,
bf8_ocp_t x)
#else
inline __host__ float scaled_type_convert<float, bf8_ocp_t>(e8m0_bexp_t scale, bf8_ocp_t x)
#endif
{
#if CK_MX_FP8_CVT_FAST_PATH
return fp8_impl::cast_to_f32_from_f8_scaled<bf8_ocp_t::default_interpret>(
type_convert<float>(scale), x.data);
#else
return type_convert<float>(scale) * type_convert<float>(x);
#endif
}
// convert 2 x f8_ocp_t to 2 x fp32
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float2_t scaled_type_convert<float2_t, f8x2_ocp_t>(e8m0_bexp_t scale,
f8x2_ocp_t x)
#else
inline __host__ float2_t scaled_type_convert<float2_t, f8x2_ocp_t>(e8m0_bexp_t scale, f8x2_ocp_t x)
#endif
{
#if CK_MX_FP8_CVT_FAST_PATH
return fp8_impl::cast_to_f32x2_from_f8x2_scaled<f8_ocp_t::default_interpret>(
type_convert<float>(scale), x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
#else
return float2_t{scaled_type_convert<float>(scale, x.AsType<f8_ocp_t>()[Number<0>{}]),
scaled_type_convert<float>(scale, x.AsType<f8_ocp_t>()[Number<1>{}])};
#endif
}
// convert 2 x bf8_ocp_t to 2 x fp32
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float2_t scaled_type_convert<float2_t, bf8x2_ocp_t>(e8m0_bexp_t scale,
bf8x2_ocp_t x)
#else
inline __host__ float2_t scaled_type_convert<float2_t, bf8x2_ocp_t>(e8m0_bexp_t scale,
bf8x2_ocp_t x)
#endif
{
#if CK_MX_FP8_CVT_FAST_PATH
return fp8_impl::cast_to_f32x2_from_f8x2_scaled<bf8_ocp_t::default_interpret>(
type_convert<float>(scale), x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
#else
return float2_t{scaled_type_convert<float>(scale, x.AsType<bf8_ocp_t>()[Number<0>{}]),
scaled_type_convert<float>(scale, x.AsType<bf8_ocp_t>()[Number<1>{}])};
#endif
}
// convert 16 x f8_ocp_t to 16 x fp32
// @note Host version gives compilation error. Requires extra compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float16_t scaled_type_convert<float16_t, f8x16_ocp_t>(e8m0_bexp_t scale,
f8x16_ocp_t x)
#else
inline __host__ float16_t scaled_type_convert<float16_t, f8x16_ocp_t>(e8m0_bexp_t scale,
f8x16_ocp_t x)
#endif
{
union
{
f8x16_ocp_t f8_1x16;
f8x2_ocp_t f8_2x8[8];
} in{x};
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}([&](auto i) {
out.float_2x8[i] = scaled_type_convert<float2_t, f8x2_ocp_t>(scale, in.f8_2x8[i]);
});
return out.float_1x16;
}
// convert 16 x bf8_ocp_t to 16 x fp32
// @note Host version gives compilation error. Requires extra compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float16_t scaled_type_convert<float16_t, bf8x16_ocp_t>(e8m0_bexp_t scale,
bf8x16_ocp_t x)
#else
inline __host__ float16_t scaled_type_convert<float16_t, bf8x16_ocp_t>(e8m0_bexp_t scale,
bf8x16_ocp_t x)
#endif
{
union
{
bf8x16_ocp_t bf8_1x16;
bf8x2_ocp_t bf8_2x8[8];
} in{x};
union
{
float16_t float_1x16;
float2_t float_2x8[8];
} out{};
ck::static_for<0, 8, 1>{}([&](auto i) {
out.float_2x8[i] = scaled_type_convert<float2_t, bf8x2_ocp_t>(scale, in.bf8_2x8[i]);
});
return out.float_1x16;
}
// convert 32 x f8_ocp_t to 32 x fp32
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float32_t scaled_type_convert<float32_t, f8x32_ocp_t>(e8m0_bexp_t scale,
f8x32_ocp_t x)
#else
inline __host__ float32_t scaled_type_convert<float32_t, f8x32_ocp_t>(e8m0_bexp_t scale,
f8x32_ocp_t x)
#endif
{
union
{
f8x32_ocp_t f8_1x32;
f8x16_ocp_t f8_16x2[2];
} in{x};
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}([&](auto i) {
out.float_16x2[i] = scaled_type_convert<float16_t, f8x16_ocp_t>(scale, in.f8_16x2[i]);
});
return out.float_1x32;
}
// convert 32 x bf8_ocp_t to 32 x fp32
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ float32_t scaled_type_convert<float32_t, bf8x32_ocp_t>(e8m0_bexp_t scale,
bf8x32_ocp_t x)
#else
inline __host__ float32_t scaled_type_convert<float32_t, bf8x32_ocp_t>(e8m0_bexp_t scale,
bf8x32_ocp_t x)
#endif
{
union
{
bf8x32_ocp_t bf8_1x32;
bf8x16_ocp_t bf8_16x2[2];
} in{x};
union
{
float32_t float_1x32;
float16_t float_16x2[2];
} out{};
ck::static_for<0, 2, 1>{}([&](auto i) {
out.float_16x2[i] = scaled_type_convert<float16_t, bf8x16_ocp_t>(scale, in.bf8_16x2[i]);
});
return out.float_1x32;
}
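// Note (illustrative): the x16 and x32 overloads above reduce to the x2 case by
// reinterpreting the wide vector as an array of packed pairs through a union and
// unrolling the per-pair conversion at compile time, e.g.
//
//   union { f8x32_ocp_t whole; f8x16_ocp_t half[2]; } split{x};
//   // convert split.half[0] and split.half[1], then reassemble the result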
// convert fp32 to fp8
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ f8_ocp_t scaled_type_convert<f8_ocp_t, float>(e8m0_bexp_t scale, float x)
#else
inline __host__ f8_ocp_t scaled_type_convert<f8_ocp_t, float>(e8m0_bexp_t scale, float x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<f8_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<f8_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32 to bf8
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ bf8_ocp_t scaled_type_convert<bf8_ocp_t, float>(e8m0_bexp_t scale,
float x)
#else
inline __host__ bf8_ocp_t scaled_type_convert<bf8_ocp_t, float>(e8m0_bexp_t scale, float x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<bf8_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<bf8_ocp_t>(x, type_convert<float>(scale));
#endif
}
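// Round-trip sketch (illustrative; assumes the MX convention that encoding
// divides by the scale and decoding multiplies by it). With the unit scale,
// FP8-representable inputs survive the round trip exactly; all other inputs are
// rounded by RNE or SR depending on CK_USE_SR_F8_CONVERSION.
inline __host__ float example_mxfp8_round_trip(float x)
{
    const e8m0_bexp_t unit{127}; // 2^(127-127) = 1
    const f8_ocp_t q = scaled_type_convert<f8_ocp_t>(unit, x);
    return scaled_type_convert<float>(unit, q);
}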
// convert fp32x2 to fp8x2
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ f8x2_ocp_t scaled_type_convert<f8x2_ocp_t, float2_t>(e8m0_bexp_t scale,
float2_t x)
#else
inline __host__ f8x2_ocp_t scaled_type_convert<f8x2_ocp_t, float2_t>(e8m0_bexp_t scale, float2_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<f8x2_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<f8x2_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32x2 to bf8x2
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ bf8x2_ocp_t scaled_type_convert<bf8x2_ocp_t, float2_t>(e8m0_bexp_t scale,
float2_t x)
#else
inline __host__ bf8x2_ocp_t scaled_type_convert<bf8x2_ocp_t, float2_t>(e8m0_bexp_t scale,
float2_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<bf8x2_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<bf8x2_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32x16 to fp8x16
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ f8x16_ocp_t
scaled_type_convert<f8x16_ocp_t, float16_t>(e8m0_bexp_t scale, float16_t x)
#else
inline __host__ f8x16_ocp_t scaled_type_convert<f8x16_ocp_t, float16_t>(e8m0_bexp_t scale,
float16_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<f8x16_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<f8x16_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32x16 to bf8x16
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ bf8x16_ocp_t
scaled_type_convert<bf8x16_ocp_t, float16_t>(e8m0_bexp_t scale, float16_t x)
#else
inline __host__ bf8x16_ocp_t scaled_type_convert<bf8x16_ocp_t, float16_t>(e8m0_bexp_t scale,
float16_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<bf8x16_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<bf8x16_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32x32 to fp8x32
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ f8x32_ocp_t
scaled_type_convert<f8x32_ocp_t, float32_t>(e8m0_bexp_t scale, float32_t x)
#else
inline __host__ f8x32_ocp_t scaled_type_convert<f8x32_ocp_t, float32_t>(e8m0_bexp_t scale,
float32_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<f8x32_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<f8x32_ocp_t>(x, type_convert<float>(scale));
#endif
}
// convert fp32x32 to bf8x32
// @note The host version does not compile without additional compiler options.
template <>
#if CK_USE_OCP_FP8
inline __host__ __device__ bf8x32_ocp_t
scaled_type_convert<bf8x32_ocp_t, float32_t>(e8m0_bexp_t scale, float32_t x)
#else
inline __host__ bf8x32_ocp_t scaled_type_convert<bf8x32_ocp_t, float32_t>(e8m0_bexp_t scale,
float32_t x)
#endif
{
#if CK_USE_SR_F8_CONVERSION
return mxf8_convert_sr<bf8x32_ocp_t>(x, type_convert<float>(scale));
#else
return mxf8_convert_rne<bf8x32_ocp_t>(x, type_convert<float>(scale));
#endif
}
// enabled only on architectures with native MX support
#if CK_USE_NATIVE_MX_SUPPORT
// convert fp4 to fp32
template <>
inline __host__ __device__ float scaled_type_convert<float, f4_t>(e8m0_bexp_t scale, f4_t x)
{
#if defined(__gfx950__)
union
{
float float_array[2];
float2_t float2_array;
} float_values{};
float_values.float2_array =
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, type_convert<float>(scale), 0);
return float_values.float_array[0];
#else
return utils::to_float<f4_t>(scale, x);
#endif
}
// convert vector of 2 fp4 to vector of 2 fp32
template <>
inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_bexp_t scale,
f4x2_t x)
{
#if defined(__gfx950__)
union
{
uint32_t bitwise;
f4x2_t f4x2_array[4];
} value{};
value.f4x2_array[0] = x;
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, type_convert<float>(scale), 0);
#else
float2_t ret{utils::to_float<f4_t>(
scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<1>{})),
utils::to_float<f4_t>(
scale, x.template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(Number<0>{}))};
return ret;
#endif
}
// convert vector of 32 fp4 to vector of 32 fp32
template <>
inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m0_bexp_t scale,
f4x32_t x)
{
#if defined(__gfx950__)
union
{
f4x32_t f4x32_array;
f4x2_t fp4x2[16];
} value{x};
union
{
uint32_t bitwise;
f4x2_t f4x2_array[4];
} bitwise_value{};
    float32_t ret;
    // Convert one packed pair per iteration: widen the f4x2 to the 32-bit
    // operand the builtin expects, then write its two floats into the result.
    ck::static_for<0, 16, 1>{}([&](auto i) {
        bitwise_value.f4x2_array[0] = value.fp4x2[i.value];
        const float2_t op           = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
            bitwise_value.bitwise, type_convert<float>(scale), 0);
        ret[2 * i.value]     = op[0];
        ret[2 * i.value + 1] = op[1];
    });
    return ret;
#else
union
{
float32_t float32_array;
float float_array[32];
} float_values{};
union
{
__uint128_t bitwise;
f4x2_t f4x2_array[16];
f4x32_t f4x32_array;
} f4_values{bit_cast<__uint128_t>(x)};
    // Unpack each packed pair into two consecutive output lanes, applying the
    // shared block scale to every element, so all 32 lanes are written.
    ck::static_for<0, 16, 1>{}([&](auto i) {
        float_values.float_array[2 * i.value] = utils::to_float<f4_t>(
            scale,
            f4_values.f4x2_array[i.value].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(
                Number<0>{}));
        float_values.float_array[2 * i.value + 1] = utils::to_float<f4_t>(
            scale,
            f4_values.f4x2_array[i.value].template AsType<f4x2_pk_t>()[Number<0>{}].unpack<>(
                Number<1>{}));
    });
return float_values.float32_array;
#endif
}
// convert fp32 to fp4
template <>
inline __host__ __device__ f4_t scaled_type_convert<f4_t, float>(e8m0_bexp_t scale, float x)
{
#if CK_USE_SR_F4_CONVERSION
return f4_convert_sr(x, type_convert<float>(scale));
#else
return f4_convert_rne(x, type_convert<float>(scale));
#endif
}
// convert vector of 2 fp32 to vector of 2 fp4
template <>
inline __host__ __device__ f4x2_t scaled_type_convert<f4x2_t, float2_t>(e8m0_bexp_t scale,
float2_t x)
{
#if CK_USE_SR_F4_CONVERSION
return f4_convert_sr(x, type_convert<float>(scale));
#else
return f4_convert_rne(x, type_convert<float>(scale));
#endif
}
// convert vector of 32 fp32 to vector of 32 fp4
template <>
inline __host__ __device__ f4x32_t scaled_type_convert<f4x32_t, float32_t>(e8m0_bexp_t scale,
float32_t x)
{
#if CK_USE_SR_F4_CONVERSION
return f4_convert_sr(x, type_convert<float>(scale));
#else
return f4_convert_rne(x, type_convert<float>(scale));
#endif
}
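// Round-trip sketch (illustrative): OCP FP4 (E2M1) represents only the
// magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}, so a unit-scale round trip is exact
// for those values and rounds everything else per CK_USE_SR_F4_CONVERSION.
inline __host__ float example_mxfp4_round_trip(float x)
{
    const e8m0_bexp_t unit{127};
    const f4_t q = scaled_type_convert<f4_t>(unit, x);
    return scaled_type_convert<float>(unit, q);
}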
/**
* @brief Converts a 6-bit floating-point value (f6_t) to a 32-bit float,
* applying the specified scaling factor.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param x The f6_t value to be converted.
* @return The converted 32-bit float representation of the input.
*/
template <>
inline __host__ __device__ float scaled_type_convert<float, f6_t>(e8m0_bexp_t scale, f6_t x)
{
#if defined(__gfx950__)
union
{
f6x32_t f6_vector;
f6_t f6_array[32];
} in{x};
union
{
float32_t float_vector;
float float_array[32];
} out{};
out.float_vector =
__builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in.f6_vector, type_convert<float>(scale));
return out.float_array[0];
#else
return utils::to_float<f6_t>(scale, x);
#endif
}
/**
* @brief Converts a vector of 32 6-bit floating-point values (f6x32_t) to a vector of 32 floats,
* applying the specified scaling factor.
*
* @param scale The exponent scale factor (e8m0_bexp_t).
* @param x The f6x32_t vector to be converted.
* @return The converted float vector representation of the input.
*/
template <>
inline __host__ __device__ float32_t scaled_type_convert<float32_t, f6x32_t>(e8m0_bexp_t scale,
f6x32_t x)
{
#if defined(__gfx950__)
return __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(x, type_convert<float>(scale));
#else
union
{
f6x32_t f6_vector;
f6_t f6_array[32];
} in{x};
union
{
float32_t float_vector;
float float_array[32];
} out{};
ck::static_for<0, 32, 1>{}(
[&](auto i) { out.float_array[i] = utils::to_float<f6_t>(scale, in.f6_array[i]); });
return out.float_vector;
#endif
}
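// Sketch (illustrative): a whole 32-element FP6 block is decoded against one
// shared scale, mirroring the MX block layout of 32 elements per scale byte.
inline __host__ float32_t example_decode_f6_block(e8m0_bexp_t scale, f6x32_t block)
{
    return scaled_type_convert<float32_t>(scale, block);
}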
/**
* @brief Converts a 6-bit floating-point value (bf6_t) to a 32-bit float,
* applying the specified scaling factor.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param x The bf6_t value to be converted.
* @return The converted 32-bit float representation of the input.
*/
template <>
inline __host__ __device__ float scaled_type_convert<float, bf6_t>(e8m0_bexp_t scale, bf6_t x)
{
#if defined(__gfx950__)
union
{
bf6x32_t bf6_vector;
bf6_t bf6_array[32];
} in{x};
union
{
float32_t float_vector;
float float_array[32];
} out{};
out.float_vector =
__builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in.bf6_vector, type_convert<float>(scale));
return out.float_array[0];
#else
return utils::to_float<bf6_t>(scale, x);
#endif
}
/**
 * @brief Converts a vector of 32 6-bit floating-point values (bf6x32_t) to a vector of 32 floats,
 * applying the specified scaling factor.
 *
 * @param scale The exponent scale factor (e8m0_bexp_t).
 * @param x The bf6x32_t vector to be converted.
 * @return The converted vector of 32 floats.
*/
template <>
inline __host__ __device__ float32_t scaled_type_convert<float32_t, bf6x32_t>(e8m0_bexp_t scale,
bf6x32_t x)
{
#if defined(__gfx950__)
return __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(x, type_convert<float>(scale));
#else
union
{
bf6x32_t bf6_vector;
bf6_t bf6_array[32];
} in{x};
union
{
float32_t float_vector;
float float_array[32];
} out{};
ck::static_for<0, 32, 1>{}(
[&](auto i) { out.float_array[i] = utils::to_float<bf6_t>(scale, in.bf6_array[i]); });
return out.float_vector;
#endif
}
/**
* @brief Converts a 32-bit float to a 6-bit floating-point value (f6_t), applying the specified
* scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (f6_convert_sr) or round-to-nearest-even (f6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param x The float value to convert.
* @return The converted 6-bit floating-point value (f6_t).
*/
template <>
inline __host__ __device__ f6_t scaled_type_convert<f6_t, float>(e8m0_bexp_t scale, float x)
{
#if CK_USE_SR_F6_CONVERSION
return f6_convert_sr(x, type_convert<float>(scale));
#else
return f6_convert_rne(x, type_convert<float>(scale));
#endif
}
/**
* @brief Converts a vector of 32 floats to a vector of 32 6-bit floating-point values (f6x32_t),
* applying the specified scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (f6_convert_sr) or round-to-nearest-even (f6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t).
* @param x The float vector to convert.
* @return The converted vector of 6-bit floating-point values (f6x32_t).
*/
template <>
inline __host__ __device__ f6x32_t scaled_type_convert<f6x32_t, float32_t>(e8m0_bexp_t scale,
float32_t x)
{
#if CK_USE_SR_F6_CONVERSION
return f6_convert_sr(x, type_convert<float>(scale));
#else
return f6_convert_rne(x, type_convert<float>(scale));
#endif
}
/**
* @brief Converts a 32-bit float to a 6-bit floating-point value (bf6_t), applying the specified
* scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (bf6_convert_sr) or round-to-nearest-even (bf6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param x The float value to convert.
* @return The converted 6-bit floating-point value (bf6_t).
*/
template <>
inline __host__ __device__ bf6_t scaled_type_convert<bf6_t, float>(e8m0_bexp_t scale, float x)
{
#if CK_USE_SR_F6_CONVERSION
return bf6_convert_sr(x, type_convert<float>(scale));
#else
return bf6_convert_rne(x, type_convert<float>(scale));
#endif
}
/**
* @brief Converts a vector of 32 floats to a vector of 32 6-bit floating-point values (bf6x32_t),
* applying the specified scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (bf6_convert_sr) or round-to-nearest-even (bf6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t).
* @param x The float vector to convert.
* @return The converted 6-bit floating-point vector (bf6x32_t).
*/
template <>
inline __host__ __device__ bf6x32_t scaled_type_convert<bf6x32_t, float32_t>(e8m0_bexp_t scale,
float32_t x)
{
#if CK_USE_SR_F6_CONVERSION
return bf6_convert_sr(x, type_convert<float>(scale));
#else
return bf6_convert_rne(x, type_convert<float>(scale));
#endif
}
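// Sketch (illustrative): the two FP6 variants trade precision for range --
// f6_t (E2M3) has finer mantissa steps, bf6_t (E3M2) a wider exponent span --
// while sharing the e8m0 block scale and the SR/RNE switch above.
inline __host__ void example_fp6_variants(float x, f6_t& q_f6, bf6_t& q_bf6)
{
    const e8m0_bexp_t unit{127};
    q_f6  = scaled_type_convert<f6_t>(unit, x);
    q_bf6 = scaled_type_convert<bf6_t>(unit, x);
}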
#endif // #if CK_USE_NATIVE_MX_SUPPORT
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#ifndef CK_CODE_GEN_RTC
#include <ostream>
#endif

#include "ck/utility/integral_constant.hpp"
#include "ck/utility/type.hpp"
...
@@ -900,6 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;
} // namespace ck

#ifndef CK_CODE_GEN_RTC
template <ck::index_t... Is>
std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
{
...
@@ -910,3 +913,4 @@ std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
    os << S::At(S::Size() - ck::Number<1>{}).value << "}";
    return os;
}
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
#define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP
...
@@ -35,10 +35,9 @@ __host__ __device__ constexpr auto to_multi_index(const T& x)
// is the alias of the latter. This is because compiler cannot infer the NSize if
// using MultiIndex<NSize>
// TODO: how to fix this?
template <typename... Ys,
          typename X,
          enable_if_t<!ck::is_integral<X>::value && !ck::is_floating_point<X>::value, bool> = false>
__host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
{
    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
...
@@ -47,10 +46,9 @@ __host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
    return y;
}

template <typename... Ys,
          typename X,
          enable_if_t<!ck::is_integral<X>::value && !ck::is_floating_point<X>::value, bool> = false>
__host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
{
    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
...
@@ -59,10 +57,9 @@ __host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
    return y;
}

template <typename... Xs,
          typename Y,
          enable_if_t<!ck::is_integral<Y>::value && !ck::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
{
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
...
@@ -73,10 +70,9 @@ __host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
    return r;
}

template <typename... Xs,
          typename Y,
          enable_if_t<!ck::is_integral<Y>::value && !ck::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
{
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
...
@@ -87,10 +83,9 @@ __host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
    return r;
}

template <typename... Xs,
          typename Y,
          enable_if_t<!ck::is_integral<Y>::value && !ck::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
{
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
...
@@ -104,7 +99,7 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
// MultiIndex = scalar * MultiIndex
template <typename... Xs,
          typename Y,
          enable_if_t<ck::is_integral<Y>::value || ck::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(Y a, const Tuple<Xs...>& x)
{
    constexpr index_t NSize = sizeof...(Xs);
...
@@ -117,7 +112,7 @@ __host__ __device__ constexpr auto operator*(Y a, const Tuple<Xs...>& x)
// MultiIndex = MultiIndex * scalar
template <typename... Xs,
          typename Y,
          enable_if_t<ck::is_integral<Y>::value || ck::is_floating_point<Y>::value, bool> = false>
__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, Y a)
{
    return a * x;
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once
...
@@ -32,7 +32,7 @@ struct TupleElementKeyData
    template <typename T,
              typename enable_if<!is_same<remove_cvref_t<T>, TupleElementKeyData>::value,
                                 bool>::type = false>
    __host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(ck::forward<T>(v))
    {
    }
...
@@ -67,7 +67,7 @@ get_tuple_element_data_reference(TupleElementKeyData<Key, Data>&& x)
template <typename Key, typename Data>
__host__ __device__ constexpr Data get_tuple_element_data(const TupleElementKeyData<Key, Data>& x)
{
    return ck::forward(x.mData);
}

template <typename Indices, typename... Xs>
...
@@ -83,13 +83,13 @@ struct TupleImpl<Sequence<Is...>, Xs...> : TupleElementKeyData<TupleElementKey<I
                                 !is_same<remove_cvref_t<Y>, TupleImpl>::value,
                                 bool>::type = false>
    __host__ __device__ constexpr TupleImpl(Y&& y)
        : TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Y>(y))...
    {
    }

    template <typename... Ys, typename enable_if<sizeof...(Ys) >= 2, bool>::type = false>
    __host__ __device__ constexpr TupleImpl(Ys&&... ys)
        : TupleElementKeyData<TupleElementKey<Is>, Xs>(ck::forward<Ys>(ys))...
    {
        static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys),
                      "wrong! inconsistent size");
...
@@ -123,14 +123,14 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
    template <typename Y,
              typename enable_if<sizeof...(Xs) == 1 && !is_same<remove_cvref_t<Y>, Tuple>::value,
                                 bool>::type = false>
    __host__ __device__ constexpr Tuple(Y&& y) : base(ck::forward<Y>(y))
    {
    }

    template <typename... Ys,
              typename enable_if<sizeof...(Ys) == sizeof...(Xs) && sizeof...(Ys) >= 2, bool>::type =
                  false>
    __host__ __device__ constexpr Tuple(Ys&&... ys) : base(ck::forward<Ys>(ys)...)
    {
    }
...
@@ -210,7 +210,7 @@ using tuple_element_t = typename tuple_element<I, TTuple>::type;
template <typename... Xs>
__host__ __device__ constexpr auto make_tuple(Xs&&... xs)
{
    return Tuple<remove_cvref_t<Xs>...>(ck::forward<Xs>(xs)...);
}

// https://en.cppreference.com/w/cpp/utility/tuple/tie
...