Merge branch 'develop' into feature/use-larger-tile-size-for-chunk-prefill

d783a8cf · Po Yen Chen · 1b130866 · 4cb3d7d7 · d783a8cf · d783a8cf
Commit d783a8cf authored Dec 05, 2024 by Po Yen Chen
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -761,11 +761,11 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
            float time{0.f};

            hip_check_error(
-                hipMemcpyWithStream(dev_gemm_kargs,
-                                    arg.gemm_kernel_args_.data(),
-                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(dev_gemm_kargs,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            auto preprocess = [&]() {
                hip_check_error(hipMemsetAsync(

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -940,10 +940,10 @@ struct DeviceGroupedGemmMultipleDXdlCShuffleTileLoop
                             const void* p_host_kernel_args) const
    {
        arg.p_dev_gemm_args_ = p_dev_kernel_args;
-        hip_check_error(hipMemcpy(p_dev_kernel_args,
-                                  p_host_kernel_args,
-                                  GetDeviceKernelArgSize(&arg),
-                                  hipMemcpyHostToDevice));
+        hip_check_error(hipMemcpyAsync(p_dev_kernel_args,
+                                       p_host_kernel_args,
+                                       GetDeviceKernelArgSize(&arg),
+                                       hipMemcpyHostToDevice));
    }

    virtual void SetDeviceKernelArgs(BaseArgument* p_arg,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -557,12 +557,12 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
                }
            }

-            hipGetErrorString(hipMemcpyWithStream(arg.p_workspace_,
-                                                  arg.gemm_desc_kernel_arg_.data(),
-                                                  arg.gemm_desc_kernel_arg_.size() *
-                                                      sizeof(GemmBiasTransKernelArg),
-                                                  hipMemcpyHostToDevice,
-                                                  stream_config.stream_id_));
+            hipGetErrorString(
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_desc_kernel_arg_.data(),
+                               arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            float ave_time = 0;


--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -421,11 +421,11 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
            }

            hip_check_error(
-                hipMemcpyWithStream(arg.p_workspace_,
-                                    arg.gemm_kernel_args_.data(),
-                                    arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
-                                    hipMemcpyHostToDevice,
-                                    stream_config.stream_id_));
+                hipMemcpyAsync(arg.p_workspace_,
+                               arg.gemm_kernel_args_.data(),
+                               arg.gemm_kernel_args_.size() * sizeof(GemmTransKernelArg),
+                               hipMemcpyHostToDevice,
+                               stream_config.stream_id_));

            float ave_time = 0;


--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -38,8 +38,7 @@ __global__ void
    // __attribute__((amdgpu_waves_per_eu(1, 1)))
    kernel_gemm_xdl_cshuffle_v3(typename GridwiseGemm::Argument karg)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, fp8_storage_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

@@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,

 #else

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
    return src_thread_element_valid ? tmp : vector_t(0);
 #endif
 }
@@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};

    return src_thread_element_valid ? tmp : vector_t(customized_value);
 }

--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/random_gen.hpp"
+#include "ck/utility/type.hpp"
+
+#ifdef CK_USE_FNUZ_FP8
+#define CK_USE_FNUZ_FP8 1
+#else
+#define CK_USE_FNUZ_FP8 0
+#endif
+
+#ifdef CK_USE_OCP_FP8
+#define CK_USE_OCP_FP8 1
+#else
+#define CK_USE_OCP_FP8 0
+#endif
+
+namespace ck {
+
+using f8_fnuz_t  = _BitInt(8);
+using bf8_fnuz_t = unsigned _BitInt(8);
+
+#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \
+    defined(__gfx1201__)) &&                                                                     \
+    __HIP_DEVICE_COMPILE__
+#define CK_FP8_CVT_FAST_PATH 1
+#else
+#define CK_FP8_CVT_FAST_PATH 0
+#endif
+
+#if(defined(__gfx1200__) || defined(__gfx1201__)) && __HIP_DEVICE_COMPILE__
+#define CK_OCP_FP8_CVT_FAST_PATH 1
+#else
+#define CK_OCP_FP8_CVT_FAST_PATH 0
+#endif
+
+typedef unsigned char fp8_storage_t;
+
+/**
+ * \brief Describes FP8 interpretation
+ *
+ * Names the 8-bit floating-point encoding that the conversion helpers in this
+ * file are instantiated for (it is used as their `interpret` template argument
+ * and as the `default_interpret` constant of the OCP wrapper types).
+ */
+enum class ck_fp8_interpretation_t
+{
+    CK_E4M3_OCP  = 0, // OCP E4M3
+    CK_E5M2_OCP  = 1, // OCP E5M2
+    CK_E4M3_FNUZ = 2, // FNUZ E4M3 (referred to as "FP8")
+    CK_E5M2_FNUZ = 3, // FNUZ E5M2 (referred to as "BF8")
+};
+
+/**
+ * \brief Describes saturation behavior
+ *
+ * Used as the `default_saturation` constant of the OCP wrapper types; the
+ * meaning of each mode is given by the enumerator notes below.
+ */
+enum class ck_saturation_t
+{
+    CK_NOSAT     = 0, // No saturation - replace with NaN or Inf
+    CK_SATFINITE = 1, // Saturate to finite
+};
+
+namespace fp8_impl {
+
+typedef fp8_storage_t fp8x2_storage_t __attribute__((ext_vector_type(2)));
+typedef float float2_t __attribute__((ext_vector_type(2)));
+
+// NaN test for f8_fnuz_t: true only when the byte pattern is exactly 0x80.
+__host__ __device__ static inline constexpr bool fnuz_f8_is_nan(f8_fnuz_t a)
+{
+    const auto raw_bits = static_cast<unsigned char>(a);
+    return raw_bits == 0x80;
+}
+// NaN test for bf8_fnuz_t: true only when the byte pattern is exactly 0x80.
+__host__ __device__ static inline constexpr bool fnuz_bf8_is_nan(bf8_fnuz_t a)
+{
+    const auto raw_bits = static_cast<unsigned char>(a);
+    return raw_bits == 0x80;
+}
+
+// NaN test for an OCP E4M3 byte: the sign bit is ignored and the remaining
+// seven bits must all be set (0x7f / 0xff).
+__host__ __device__ static inline constexpr bool ocp_f8_is_nan(fp8_storage_t a)
+{
+    const int magnitude_bits = a & 0x7f;
+    return magnitude_bits == 0x7f;
+}
+// NaN test for an OCP E5M2 byte: the sign bit is ignored and the remaining
+// seven bits must be strictly above 0x7c (i.e. 0x7d, 0x7e or 0x7f).
+__host__ __device__ static inline constexpr bool ocp_bf8_is_nan(fp8_storage_t a)
+{
+    const int magnitude_bits = a & 0x7f;
+    return 0x7c < magnitude_bits;
+}
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220
+// This has been modified to handle double types as well
+//
+// Decodes one 8-bit float `x` into T (_Float16, float or double only).
+//   wm, we  : mantissa / exponent bit widths of the 8-bit source format
+//   is_fnuz : true  -> 0x80 decodes to NaN and the exponent re-bias is one lower
+//             false -> 0x80 decodes to -0; NaN/Inf patterns are recognised by the
+//                      e4m3 / e5m2 checks below
+//   clip    : only consulted for non-FNUZ e5m2 infinities, which then decode to
+//             +/-57344 instead of +/-Inf
+// Returns the decoded value; x == 0 always yields +0.
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false>
+__host__ __device__ static inline T cast_from_f8(fp8_storage_t x)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double, "only half, float and double are supported");
+
+    // Exponent / mantissa widths of the destination type T
+    constexpr int weo = is_half ? 5 : (is_float ? 8 : 11);
+    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 52);
+
+    // Special values of T, built from their bit patterns
+    T fInf, fNegInf, fNaN, fNeg0, fmax, fmin;
+    if constexpr(is_half)
+    {
+        const unsigned short int ihInf    = 0x7C00;
+        const unsigned short int ihNegInf = 0xFC00;
+        const unsigned short int ihNaN    = 0x7C01;
+        const unsigned short int ihNeg0   = 0x8000;
+        /* Max number in e5m2 57344*/
+        const unsigned short int ifmax = 0x7B00;
+        const unsigned short int ifmin = 0xFB00;
+
+        fInf    = bit_cast<_Float16>(ihInf);
+        fNegInf = bit_cast<_Float16>(ihNegInf);
+        fNaN    = bit_cast<_Float16>(ihNaN);
+        fNeg0   = bit_cast<_Float16>(ihNeg0);
+        fmax    = bit_cast<_Float16>(ifmax);
+        fmin    = bit_cast<_Float16>(ifmin);
+    }
+    else if constexpr(is_float)
+    {
+        const unsigned int ifInf    = 0x7F800000;
+        const unsigned int ifNegInf = 0xFF800000;
+        const unsigned int ifNaN    = 0x7F800001;
+        const unsigned int ifNeg0   = 0x80000000;
+        /* Max number in e5m2 57344*/
+        const unsigned int ifmax = 0x47600000;
+        const unsigned int ifmin = 0xC7600000;
+
+        fInf    = bit_cast<float>(ifInf);
+        fNegInf = bit_cast<float>(ifNegInf);
+        fNaN    = bit_cast<float>(ifNaN);
+        fNeg0   = bit_cast<float>(ifNeg0);
+        fmax    = bit_cast<float>(ifmax);
+        fmin    = bit_cast<float>(ifmin);
+    }
+    else if constexpr(is_double)
+    {
+        const unsigned long long ifInf    = 0x7FF0000000000000ull;
+        const unsigned long long ifNegInf = 0xFFF0000000000000ull;
+        const unsigned long long ifNaN    = 0x7FF0000000000001ull;
+        const unsigned long long ifNeg0   = 0x8000000000000000ull;
+        /* Max number in e5m2 57344*/
+        const unsigned long long ifmax = 0x40EC000000000000ull;
+        const unsigned long long ifmin = 0xC0EC000000000000ull;
+
+        fInf    = bit_cast<double>(ifInf);
+        fNegInf = bit_cast<double>(ifNegInf);
+        fNaN    = bit_cast<double>(ifNaN);
+        fNeg0   = bit_cast<double>(ifNeg0);
+        fmax    = bit_cast<double>(ifmax);
+        fmin    = bit_cast<double>(ifmin);
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    // Split the byte into sign / exponent / mantissa fields
+    unsigned long long sign     = x >> 7;
+    unsigned long long mantissa = x & ((1 << wm) - 1);
+    int exponent                = (x & 0x7F) >> wm;
+    if constexpr(is_fnuz)
+    {
+        if(x == 0x80)
+        {
+            return fNaN;
+        }
+    }
+    else
+    {
+        if(x == 0x80)
+        {
+            return fNeg0;
+        }
+        if constexpr(we == 4)
+        { // e4m3
+            if((x & 0x7F) == 0x7F)
+            {
+                return fNaN;
+            }
+        }
+        else if((x & 0x7C) == 0x7C)
+        { // e5m2
+            if((x & 0x3) == 0)
+            {
+                if constexpr(clip)
+                {
+                    return sign ? fmin : fmax;
+                }
+                return sign ? fNegInf : fInf;
+            }
+            return fNaN;
+        }
+    }
+
+    // Unsigned integer with the same size as T, used to assemble the result bits
+    typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type retval;
+
+    // Non-FNUZ e5m2 -> half: the byte is used directly as the upper 8 bits of the half
+    if constexpr(we == 5 && is_half && !is_fnuz)
+    {
+        retval = x << 8;
+        return bit_cast<T>(retval);
+    }
+
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (is_fnuz ? 1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        int sh = 1 + __clz(mantissa) - (32 - wm);
+#else
+        int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
+#endif
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1ull << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output (occurs when T=half, we=5, is_fnuz=true)
+    if(exponent <= 0)
+    {
+        // 64-bit literal: wmo is 52 for double, so an int shift here would exceed
+        // the width of int even though this branch is only reached for half.
+        mantissa |= 1ull << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    if constexpr(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa;
+    else if constexpr(sizeof(T) == 4)
+        retval = (sign << 31) | (exponent << 23) | mantissa;
+    else
+        retval = (sign << 63) | (static_cast<unsigned long long>(exponent) << 52) | mantissa;
+
+    return bit_cast<T>(retval);
+}
+
+#if CK_FP8_CVT_FAST_PATH
+// Device-only conversion of one 8-bit float byte to float using the AMDGPU
+// cvt builtins. E4M3 interpretations (FNUZ or OCP) use the fp8 builtin, E5M2
+// interpretations use the bf8 builtin; byte selector 0 is passed in both cases.
+template <ck_fp8_interpretation_t interpret>
+static __device__ float cast_to_f32_from_f8(fp8_storage_t v)
+{
+    union
+    {
+        unsigned int i32val;
+        unsigned char i8val[4];
+    } val;
+    // Zero the whole word first so the three bytes that are not overwritten
+    // below hold a defined value when i32val is read.
+    val.i32val   = 0;
+    val.i8val[0] = v;
+
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    if constexpr((interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                 (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP))
+    {
+        return __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
+    }
+}
+
+// Device-only conversion of two packed 8-bit floats to a float2_t via the
+// AMDGPU packed cvt builtins (fp8 builtin for E4M3, bf8 builtin for E5M2).
+template <ck_fp8_interpretation_t interpret>
+static __device__ float2_t cast_to_f32x2_from_f8x2(fp8x2_storage_t v)
+{
+    static_assert(interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E4M3_OCP ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_FNUZ ||
+                      interpret == ck_fp8_interpretation_t::CK_E5M2_OCP,
+                  "Only FNUZ and OCP interpretations are supported");
+
+    constexpr bool is_e4m3 = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                             (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP);
+
+    // Reinterpret the two-byte vector as one 16-bit word for the builtin
+    const auto packed_bits = bit_cast<uint16_t>(v);
+
+    if constexpr(is_e4m3)
+    {
+        return __builtin_amdgcn_cvt_pk_f32_fp8(packed_bits, false);
+    }
+    else
+    {
+        return __builtin_amdgcn_cvt_pk_f32_bf8(packed_bits, false);
+    }
+}
+
+#endif
+
+} // namespace fp8_impl
+
+// One-byte OCP E4M3 value with explicit conversions to float and _Float16.
+// The conversion operators are host+device when CK_USE_OCP_FP8 is non-zero and
+// host-only otherwise.
+struct f8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data; // raw encoded byte
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E4M3_OCP;
+
+    static constexpr unsigned int we = 4; // exponent width
+    static constexpr unsigned int wm = 3; // mantissa width
+
+    // Byte-wise equality, except that a NaN never compares equal (NaN != NaN).
+    __host__ __device__ constexpr bool operator==(const f8_ocp_t& other) const
+    {
+        if(fp8_impl::ocp_f8_is_nan(data))
+        {
+            return false;
+        }
+        return data == other.data;
+    }
+
+    // Decode to float: hardware builtin on the OCP fast path, software decode otherwise.
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(data);
+#else
+        // XXX: clip is left at its default (false); must be consistent with operator _Float16
+        constexpr bool is_fnuz = false;
+        return fp8_impl::cast_from_f8<float, wm, we, is_fnuz>(data);
+#endif
+    }
+
+    // Decode to _Float16: via float on the OCP fast path, software decode otherwise.
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        const float as_f32 = fp8_impl::cast_to_f32_from_f8<default_interpret>(data);
+        return static_cast<_Float16>(as_f32);
+#else
+        // XXX: clip is left at its default (false); must be consistent with operator float
+        constexpr bool is_fnuz = false;
+        return fp8_impl::cast_from_f8<_Float16, wm, we, is_fnuz>(data);
+#endif
+    }
+};
+
+// One-byte OCP E5M2 value with explicit conversions to float and _Float16.
+// The conversion operators are host+device when CK_USE_OCP_FP8 is non-zero and
+// host-only otherwise.
+struct bf8_ocp_t
+{
+    using data_type = fp8_storage_t;
+    data_type data; // raw encoded byte
+
+    static constexpr ck_saturation_t default_saturation = ck_saturation_t::CK_SATFINITE;
+    static constexpr ck_fp8_interpretation_t default_interpret =
+        ck_fp8_interpretation_t::CK_E5M2_OCP;
+
+    static constexpr unsigned int we = 5; // exponent width
+    static constexpr unsigned int wm = 2; // mantissa width
+
+    // Byte-wise equality, except that a NaN never compares equal.
+    __host__ __device__ constexpr bool operator==(const bf8_ocp_t& other) const
+    {
+        return (data == other.data) && (fp8_impl::ocp_bf8_is_nan(data) == false); // NaN != NaN
+    }
+
+    // Decode to float. The fast path is selected with CK_OCP_FP8_CVT_FAST_PATH,
+    // the same switch f8_ocp_t uses, so the builtin-based helper is only
+    // referenced in a device compilation pass where it is actually declared.
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator float() const
+#else
+    __host__ explicit operator float() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data);
+#else
+        return fp8_impl::cast_from_f8<float, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator _Float16
+#endif
+    }
+
+    // Decode to _Float16: via float on the OCP fast path, software decode otherwise.
+#if CK_USE_OCP_FP8
+    __host__ __device__ explicit operator _Float16() const
+#else
+    __host__ explicit operator _Float16() const
+#endif
+    {
+#if CK_OCP_FP8_CVT_FAST_PATH
+        return static_cast<_Float16>(fp8_impl::cast_to_f32_from_f8<default_interpret>(this->data));
+#else
+        return fp8_impl::cast_from_f8<_Float16, wm, we, false>(
+            this->data); // XXX: clip==false must be consistent with operator float
+#endif
+    }
+};
+
+// NaN predicate for the 8-bit float types. Only the explicit specializations
+// below are defined; each forwards to the matching fp8_impl helper.
+template <typename T>
+__host__ __device__ static inline constexpr bool fp8_is_nan(T);
+
+// OCP E4M3
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_ocp_t a)
+{
+    const fp8_storage_t raw_bits = a.data;
+    return fp8_impl::ocp_f8_is_nan(raw_bits);
+}
+
+// OCP E5M2
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_ocp_t a)
+{
+    const fp8_storage_t raw_bits = a.data;
+    return fp8_impl::ocp_bf8_is_nan(raw_bits);
+}
+
+// f8_fnuz_t
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(f8_fnuz_t a)
+{
+    const f8_fnuz_t encoded = a;
+    return fp8_impl::fnuz_f8_is_nan(encoded);
+}
+
+// bf8_fnuz_t
+template <>
+__host__ __device__ inline constexpr bool fp8_is_nan(bf8_fnuz_t a)
+{
+    const bf8_fnuz_t encoded = a;
+    return fp8_impl::fnuz_bf8_is_nan(encoded);
+}
+
+// Infinity predicate for the 8-bit float types. The generic version always
+// answers false; only bf8_ocp_t is specialized to recognise an infinity pattern.
+template <typename T,
+          std::enable_if_t<std::is_same_v<T, bf8_ocp_t> || std::is_same_v<T, f8_ocp_t> ||
+                               std::is_same_v<T, bf8_fnuz_t> || std::is_same_v<T, f8_fnuz_t>,
+                           bool> = true>
+__host__ __device__ static inline constexpr bool fp8_is_inf(T)
+{
+    constexpr bool has_infinity = false;
+    return has_infinity;
+}
+
+// bf8_ocp_t: infinity when the seven non-sign bits equal 0x7c.
+template <>
+__host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a)
+{
+    const int magnitude_bits = a.data & 0x7f;
+    return magnitude_bits == 0x7c;
+}
+
+namespace fp8_impl {
+
+// Assertions to check for supported conversion types
+#define __assert_ocp_support(interp)                                               \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP &&                       \
+           interp != ck_fp8_interpretation_t::CK_E5M2_OCP)                         \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+#define __assert_fnuz_support(interp)                                              \
+    {                                                                              \
+        if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ &&                      \
+           interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ)                        \
+        {                                                                          \
+            __hip_assert(false && "type is unsupported by current target device"); \
+        }                                                                          \
+    }
+
+// Sanity check for a requested FP8 interpretation.
+// Only a device compilation pass emits any code: it expands __assert_ocp_support
+// when CK_USE_OCP_FP8 is non-zero and __assert_fnuz_support when CK_USE_FNUZ_FP8
+// is non-zero. In a host pass (or when both switches are zero) the body is empty,
+// which is why the parameter is marked [[maybe_unused]].
+// NOTE(review): if both switches were non-zero every interpretation would fail
+// one of the two assertions - presumably the build enables at most one; confirm.
+__host__ __device__ static inline void
+__is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp)
+{
+#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+#if CK_USE_OCP_FP8
+    __assert_ocp_support(interp);
+#endif
+#if CK_USE_FNUZ_FP8
+    __assert_fnuz_support(interp);
+#endif
+#endif
+}
+
+#if CK_FP8_CVT_FAST_PATH
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79
+//
+// Device-only float -> 8-bit float conversion built on the AMDGPU cvt builtins.
+//   interpret           : target encoding; E4M3 variants use the fp8 builtins,
+//                         everything else uses the bf8 builtins
+//   saturate            : clamp finite inputs to the target's largest magnitude
+//                         (240 for E4M3 FNUZ, 448 for E4M3 OCP, 57344 otherwise);
+//                         NaN/Inf inputs are passed through unclamped
+//   stochastic_rounding : use the stochastic-rounding builtin with `rng`,
+//                         otherwise the packed round-to-nearest-even builtin
+// Returns byte 0 of the builtin's 32-bit result.
+template <ck_fp8_interpretation_t interpret, bool saturate, bool stochastic_rounding = false>
+static __device__ fp8_storage_t cast_to_f8_from_f32(float v, unsigned int rng = 0)
+{
+    constexpr bool is_e4m3 = (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ) ||
+                             (interpret == ck_fp8_interpretation_t::CK_E4M3_OCP);
+
+    union
+    {
+        float fval;
+        unsigned int i32val;
+        unsigned char i8val[4]; // NOTE: not endian independent
+    } val;
+    val.fval = v;
+
+    if constexpr(saturate)
+    {
+        constexpr float max_abs =
+            (interpret == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+                ? 240.0f
+                : ((interpret == ck_fp8_interpretation_t::CK_E4M3_OCP) ? 448.0f : 57344.0f);
+
+        const bool is_inf_or_nan = (val.i32val & 0x7F800000) == 0x7F800000;
+        if(!is_inf_or_nan)
+        { /// propagate NAN/INF, no clipping
+            val.fval = __builtin_amdgcn_fmed3f(val.fval, max_abs, -max_abs);
+        }
+    }
+
+    unsigned int converted = 0;
+    if constexpr(stochastic_rounding)
+    {
+        if constexpr(is_e4m3)
+        {
+            converted = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, converted, 0); // 0 pos
+        }
+        else
+        {
+            converted = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, converted, 0); // 0 pos
+        }
+    }
+    else
+    { // RNE CVT
+        if constexpr(is_e4m3)
+        {
+            converted = __builtin_amdgcn_cvt_pk_fp8_f32(
+                val.fval, val.fval, converted, false); // false -> WORD0
+        }
+        else
+        {
+            converted = __builtin_amdgcn_cvt_pk_bf8_f32(
+                val.fval, val.fval, converted, false); // false -> WORD0
+        }
+    }
+
+    val.i32val = converted;
+    return val.i8val[0]; // little endian
+}
+#endif // CK_FP8_CVT_FAST_PATH
+
+// The conversion function is from rocblas
+// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39
+// This has been modified to add double types conversion as well
+template <typename T, int wm, int we, bool is_fnuz, bool clip = false, bool stoch = false>
+__host__ __device__ static inline fp8_storage_t cast_to_f8(T _x, unsigned int rng = 0)
+{
+    constexpr bool is_half   = __hip_internal::is_same<T, _Float16>::value;
+    constexpr bool is_float  = __hip_internal::is_same<T, float>::value;
+    constexpr bool is_double = __hip_internal::is_same<T, double>::value;
+    static_assert(is_half || is_float || is_double,
+                  "Only half, float and double can be cast to f8");
+
+    constexpr int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10);
+
+    using T_bitwise = typename __hip_internal::conditional<
+        sizeof(T) == 2,
+        unsigned short int,
+        typename __hip_internal::conditional<sizeof(T) == 4, unsigned int, unsigned long long>::
+            type>::type;
+    T_bitwise x_bitwise = bit_cast<T_bitwise>(_x);
+
+    unsigned long long x{x_bitwise};
+
+    unsigned long long head, mantissa;
+    int exponent, bias;
+    unsigned int sign;
+    unsigned long long fInf, mask;
+
+    if constexpr(sizeof(T) == 8)
+    {
+        head     = x & 0xFFF0000000000000ull;
+        mantissa = x & 0xFFFFFFFFFFFFFull;
+        exponent = (head >> 52) & 0x7FF;
+        sign     = head >> 63;
+        bias     = 1023;
+        fInf     = 0x7FF0000000000000ull;
+        mask     = 0x7FFFFFFFFFFFFFFFull;
+    }
+    else if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+        fInf     = 0x7F800000;
+        mask     = 0x7FFFFFFF;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+        fInf     = 0x7C00;
+        mask     = 0x7FFF;
+    }
+    unsigned int signed_inf = 0;
+    unsigned int nan        = 0;
+    if constexpr(is_fnuz)
+    {
+        signed_inf = clip ? ((sign << 7) + 0x7f) : 0x80;
+        nan        = 0x80;
+    }
+    else
+    {
+        if constexpr(we == 4)
+        { // e4m3
+            signed_inf = (sign << 7) + (clip ? 0x7e : 0x7f);
+        }
+        else
+        { // e5m2
+            signed_inf = (sign << 7) + (clip ? 0x7b : 0x7c);
+        }
+        nan = (sign << 7) + 0x7f;
+    }
+    // Max values
+    unsigned long long ifmax = 0;
+    if constexpr(sizeof(T) == 8)
+    {
+        if constexpr(we == 5)
+        { // 57344
+            ifmax = 0x40EC000000000000ull;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            { // 240
+                ifmax = 0x406E000000000000ull;
+            }
+            else
+            { // 448
+                ifmax = 0x407C000000000000ull;
+            }
+        }
+    }
+    else if(sizeof(T) == 4)
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x47600000;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x43700000;
+            }
+            else
+            {
+                ifmax = 0x43E00000;
+            }
+        }
+    }
+    else
+    {
+        if constexpr(we == 5)
+        {
+            ifmax = 0x7B00;
+        }
+        else
+        {
+            if constexpr(is_fnuz)
+            {
+                ifmax = 0x5B80;
+            }
+            else
+            {
+                ifmax = 0x5F00;
+            }
+        }
+    }
+    // Deal with inf and NaNs
+    if((x & fInf) == fInf)
+    {
+        if constexpr(is_fnuz)
+            return signed_inf;
+
+        return mantissa != 0 ? nan : signed_inf;
+    }
+
+    if((x & mask) > ifmax)
+    {
+        return signed_inf;
+    }
+
+    if(x == 0)
+    {
+        return 0;
+    }
+
+    // First, check whether the input is normal or denormal, because the two
+    // differ in the implicit leading 1. Then adjust the exponent to align with
+    // the F8 exponent and, in the meantime, shift the mantissa. For stochastic
+    // rounding, add rng to the mantissa and truncate; for RNE, rng is not added.
+    // Finally, check whether there is a carry and, if so, adjust the exponent
+    // and mantissa again.
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent
+    // bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (is_fnuz ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we
+    mostly concern fp16 here. In this case, f8 is usually in denormal. But there
+    could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has
+    exponent bias 16. It means that there are some numbers in fp16 denormal but they
+    are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+    where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
+    (NANOO) normal. In this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal
+      range. For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+      actual exponent is -7, it is actually larger due to the implicit 1,
+      Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+      So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {                      // both fp32/fp16 and f8 are in normal range
+            exponent_diff = 0; // exponent_diff=0 does not mean there is no difference
+                               // for this case, act_exponent could be larger. Just
+                               // that it does not need shift mantissa
+        }
+        mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa
+    }
+
+    bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (1ull << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be
+  done before we shift right as shift right could rip off some residual part and
+  make something not midpoint look like midpoint. For example, the fp16 number
+  0x1002 (0 00100 0000000010), it is larger than midpoint, but after shift right
+  by 4 bits, it would look like midpoint.
+  */
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1ull << mfmt);
+    // if there is no implicit 1, it  means the f8 is denormal and need to adjust
+    // to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa +=
+        (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask;
+
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1ull << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1ull << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1;
+            f8_exponent++;
+        }
+    }
+
+    mantissa >>= (mfmt - wm);
+
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - 1;
+    if(f8_exponent > max_exp)
+    {
+        if constexpr(clip)
+        {
+            mantissa    = (1 << wm) - 1;
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    if(f8_exponent == 0 && mantissa == 0)
+        return is_fnuz ? 0 : (sign << 7);
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+
+/**
+ * \brief convert float to @p fp8_storage_t
+ *
+ * With CK_FP8_CVT_FAST_PATH the conversion is forwarded to cast_to_f8_from_f32;
+ * otherwise it dispatches on @p interp to the matching cast_to_f8 instantiation.
+ * In the non-fast-path build the function is host-only unless CK_USE_OCP_FP8 is set.
+ *
+ * \tparam interp interpretation of fp8
+ * \tparam sat saturation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR; when true the random bits
+ *         are produced by prand_generator from the address and the value of @p f
+ * \param f float number
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+    __is_interpret_supported(interp);
+    uint32_t rng = 0;
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
+        f, rng);
+#else
+#if CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#else
+__host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
+{
+#endif
+    uint32_t rng = 0; // stays 0 unless stochastic rounding is requested
+    if constexpr(stochastic_rounding)
+    {
+        constexpr int seed = 1254739;
+        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+    }
+
+    if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_FNUZ)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          true,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_OCP)
+    {
+        return cast_to_f8<float,
+                          3,
+                          4,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else if constexpr(interp == ck_fp8_interpretation_t::CK_E5M2_OCP)
+    {
+        return cast_to_f8<float,
+                          2,
+                          5,
+                          false,
+                          sat == ck_saturation_t::CK_SATFINITE,
+                          stochastic_rounding>(f, rng);
+    }
+    else
+    {
+        __hip_assert(false && "FP8 type is not supported by current target device");
+        return 0; // fallback value returned after the assert for an unhandled interp
+    }
+#endif // CK_FP8_CVT_FAST_PATH
+} // closes whichever cvt_float_to_fp8 body the preprocessor selected above
+
+/**
+ * \brief convert _Float16 to @p fp8_storage_t
+ *
+ * The input is widened to float and handed to cvt_float_to_fp8 with the same
+ * template arguments.
+ *
+ * \tparam interp interpretation of fp8
+ * \tparam sat saturation of fp8
+ * \tparam stochastic_rounding switch between RNE and SR
+ * \param x _Float16 value
+ * \return fp8_storage_t
+ */
+template <ck_fp8_interpretation_t interp,
+          ck_saturation_t sat      = ck_saturation_t::CK_SATFINITE,
+          bool stochastic_rounding = false>
+#if CK_FP8_CVT_FAST_PATH || CK_USE_OCP_FP8
+__host__ __device__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#else
+__host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
+#endif
+{
+    return cvt_float_to_fp8<interp, sat, stochastic_rounding>(static_cast<float>(x));
+}
+
+} // namespace fp8_impl
+
+// Declare (only) the primary template for fp8 conversion using round-to-nearest-even (RNE)
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_rne(X x);
+
+// convert fp32 to fp8 (f8_ocp_t) with rounding to nearest even
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+// convert fp32 to bf8 (bf8_ocp_t) with rounding to nearest even
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(x)};
+}
+
+// convert _Float16 to fp8 (f8_ocp_t) with rounding to nearest even; the bf8_ocp_t variant follows
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_rne<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation>(x)};
+}
+
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_rne<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{
+        fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret, bf8_ocp_t::default_saturation>(
+            x)};
+}
+
+// Declare (only) the primary template for fp8 conversion using stochastic rounding (SR)
+template <typename Y, typename X>
+__host__ __device__ constexpr Y f8_convert_sr(X x);
+
+// convert fp32 to fp8 (f8_ocp_t) with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, float>(float x)
+{
+    return f8_ocp_t{
+        fp8_impl::cvt_float_to_fp8<f8_ocp_t::default_interpret, f8_ocp_t::default_saturation, true>(
+            x)};
+}
+
+// convert fp32 to bf8 (bf8_ocp_t) with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, float>(float x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_float_to_fp8<bf8_ocp_t::default_interpret,
+                                                bf8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to fp8 (f8_ocp_t) with stochastic rounding
+template <>
+inline __host__ __device__ f8_ocp_t f8_convert_sr<f8_ocp_t, _Float16>(_Float16 x)
+{
+    return f8_ocp_t{fp8_impl::cvt_half_t_to_fp8<f8_ocp_t::default_interpret,
+                                                f8_ocp_t::default_saturation,
+                                                true>(x)};
+}
+
+// convert _Float16 to bf8 (bf8_ocp_t) with stochastic rounding
+template <>
+inline __host__ __device__ bf8_ocp_t f8_convert_sr<bf8_ocp_t, _Float16>(_Float16 x)
+{
+    return bf8_ocp_t{fp8_impl::cvt_half_t_to_fp8<bf8_ocp_t::default_interpret,
+                                                 bf8_ocp_t::default_saturation,
+                                                 true>(x)};
+}
+
+#if CK_USE_OCP_FP8
+using f8_t  = f8_ocp_t;
+using bf8_t = bf8_ocp_t;
+#define CK_FP8_TYPE_FNUZ 0
+#define CK_FP8_TYPE_OCP 1
+#else
+using f8_t = f8_fnuz_t;
+using bf8_t = bf8_fnuz_t;
+#define CK_FP8_TYPE_FNUZ 1
+#define CK_FP8_TYPE_OCP 0
+#endif
+
+} // namespace ck
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -4,7 +4,7 @@
 #pragma once

 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/statically_indexed_array.hpp"

 namespace ck {
@@ -10,8 +11,6 @@ namespace ck {
 using bhalf_t = ushort;
 using half_t  = _Float16;
 using int4_t  = _BitInt(4);
-using f8_t    = _BitInt(8);
-using bf8_t   = unsigned _BitInt(8);

 inline constexpr auto next_pow2(uint32_t x)
 {
@@ -19,14 +18,15 @@ inline constexpr auto next_pow2(uint32_t x)
    return x > 1u ? (1u << (32u - __builtin_clz(x - 1u))) : x;
 }

-// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_t, bf8_t, bool
+// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_fnuz_t, bf8_fnuz_t,
+//               bool
 template <typename T>
 inline constexpr bool is_native_type()
 {
    return is_same<T, double>::value || is_same<T, float>::value || is_same<T, half_t>::value ||
           is_same<T, bhalf_t>::value || is_same<T, int32_t>::value || is_same<T, int8_t>::value ||
-           is_same<T, uint8_t>::value || is_same<T, f8_t>::value || is_same<T, bf8_t>::value ||
-           is_same<T, bool>::value;
+           is_same<T, uint8_t>::value || is_same<T, f8_fnuz_t>::value ||
+           is_same<T, bf8_fnuz_t>::value || is_same<T, bool>::value;
 }

 // vector_type
@@ -166,16 +166,30 @@ struct scalar_type<int4_t>
 #endif

 template <>
-struct scalar_type<f8_t>
+struct scalar_type<f8_fnuz_t>
 {
-    using type                           = f8_t;
+    using type                           = f8_fnuz_t;
    static constexpr index_t vector_size = 1;
 };

 template <>
-struct scalar_type<bf8_t>
+struct scalar_type<bf8_fnuz_t>
 {
-    using type                           = bf8_t;
+    using type                           = bf8_fnuz_t;
+    static constexpr index_t vector_size = 1;
+};
+
+template <>
+struct scalar_type<f8_ocp_t>
+{
+    using type                           = f8_ocp_t::data_type;
+    static constexpr index_t vector_size = 1;
+};
+
+template <>
+struct scalar_type<bf8_ocp_t>
+{
+    using type                           = bf8_ocp_t::data_type;
    static constexpr index_t vector_size = 1;
 };

@@ -1010,60 +1024,203 @@ struct vector_type<T, 256, typename std::enable_if_t<is_native_type<T>()>>
    }
 };

+template <typename T, index_t N, typename Enable = void>
+struct non_native_vector_base;
+
+template <typename T>
+struct nnvb_data_t_selector
+{
+    using type = unsigned _BitInt(8 * sizeof(T));
+};
+
+template <>
+struct nnvb_data_t_selector<f8_ocp_t>
+{
+    using type = f8_ocp_t::data_type;
+};
+template <>
+struct nnvb_data_t_selector<bf8_ocp_t>
+{
+    using type = bf8_ocp_t::data_type;
+};
+
+template <typename T, index_t N>
+struct non_native_vector_base<
+    T,
+    N,
+    std::enable_if_t<sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8>>
+{
+    using data_t = typename nnvb_data_t_selector<T>::type; // storage scalar chosen by nnvb_data_t_selector; must be the same size as T
+    static_assert(sizeof(T) == sizeof(data_t), "non_native_vector_base storage size mismatch");
+    using data_v = data_t __attribute__((ext_vector_type(N))); // N-element ext_vector_type of data_t
+    using type   = non_native_vector_base<T, N>;
+
+    union alignas(next_pow2(N * sizeof(T)))
+    {
+        data_v dN; // storage as a single data_v vector
+        StaticallyIndexedArray<data_t, N> dxN; // same union storage viewed as N data_t scalars
+        StaticallyIndexedArray<T, N> dTxN; // same union storage viewed as N values of T
+        StaticallyIndexedArray<data_v, 1> dNx1; // same union storage viewed as a 1-element array of data_v
+    } data_;
+
+    __host__ __device__ constexpr non_native_vector_base(data_t a) : data_{data_v(a)} {}
+    __host__ __device__ constexpr non_native_vector_base(T f)
+        : non_native_vector_base(bit_cast<data_t>(f))
+    {
+    }
+    __host__ __device__ constexpr non_native_vector_base() : non_native_vector_base(T{}){};
+    __host__ __device__ constexpr non_native_vector_base(data_v v) : data_{v} {}
+
+    __host__ __device__ constexpr operator data_v() const { return data_.dN; }
+    __host__ __device__ constexpr operator data_t() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dxN; // XXX N != 1: returning the whole array as data_t is expected to be rejected
+        }
+    }
+    __host__ __device__ constexpr operator T() const
+    {
+        if constexpr(N == 1)
+        {
+            return data_.dTxN[Number<0>{}];
+        }
+        else
+        {
+            return data_.dTxN; // XXX N != 1: returning the whole array as T is expected to be rejected
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err; // not reachable for any X accepted by the static_assert above
+        }
+    }
+
+    template <typename X>
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same_v<X, data_t> || is_same_v<X, T> || is_same_v<X, data_v>,
+                      "Something went wrong, please check src and dst types.");
+
+        if constexpr(is_same_v<X, data_t>)
+        {
+            return data_.dxN;
+        }
+        else if constexpr(is_same_v<X, T>)
+        {
+            return data_.dTxN;
+        }
+        else if constexpr(is_same_v<X, data_v>)
+        {
+            return data_.dNx1;
+        }
+        else
+        {
+            return err; // not reachable for any X accepted by the static_assert above
+        }
+    }
+};
+
 template <typename T, index_t N>
-struct non_native_vector_base
+struct scalar_type<non_native_vector_base<T, N>>;
+
+template <index_t N>
+struct scalar_type<non_native_vector_base<f8_ocp_t, N>>
 {
-    using type = non_native_vector_base<T, N>;
+    using type = typename non_native_vector_base<f8_ocp_t, N>::data_t;
+
+    static constexpr index_t vector_size = N;
+};

-    __host__ __device__ non_native_vector_base()            = default;
-    __host__ __device__ non_native_vector_base(const type&) = default;
-    __host__ __device__ non_native_vector_base(type&&)      = default;
-    __host__ __device__ ~non_native_vector_base()           = default;
+template <index_t N>
+struct scalar_type<non_native_vector_base<bf8_ocp_t, N>>
+{
+    using type = typename non_native_vector_base<bf8_ocp_t, N>::data_t;

-    T d[N];
+    static constexpr index_t vector_size = N;
 };

 // non-native vector_type implementation
 template <typename T>
 struct vector_type<T, 1, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using type = d1_t;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using type     = d1_nnv_t;

    union alignas(next_pow2(1 * sizeof(T)))
    {
        d1_t d1_;
        StaticallyIndexedArray<d1_t, 1> d1x1_;
+        d1_nnv_t d1_nnv_;
    } data_;

-    __host__ __device__ constexpr vector_type() : data_{type{}} {}
+    __host__ __device__ constexpr vector_type() : data_{d1_t{}} {}

    __host__ __device__ constexpr vector_type(type v) : data_{v} {}

    template <typename X>
    __host__ __device__ constexpr const auto& AsType() const
    {
-        static_assert(is_same<X, d1_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
                      "Something went wrong, please check src and dst types.");

-        return data_.d1x1_;
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
    }

    template <typename X>
    __host__ __device__ constexpr auto& AsType()
    {
-        static_assert(is_same<X, d1_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value,
                      "Something went wrong, please check src and dst types.");

-        return data_.d1x1_;
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
+        {
+            return data_.d1x1_;
+        }
+        else
+        {
+            return err;
+        }
    }
 };

 template <typename T>
 struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;

    using type = d2_t;

@@ -1081,10 +1238,11 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr const auto& AsType() const
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x2_;
        }
@@ -1101,10 +1259,11 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr auto& AsType()
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x2_;
        }
@@ -1122,9 +1281,10 @@ struct vector_type<T, 2, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
-    using d4_t = non_native_vector_base<T, 4>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;

    using type = d4_t;

@@ -1143,10 +1303,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr const auto& AsType() const
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x4_;
        }
@@ -1167,10 +1328,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr auto& AsType()
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x4_;
        }
@@ -1192,10 +1354,11 @@ struct vector_type<T, 4, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t = T;
-    using d2_t = non_native_vector_base<T, 2>;
-    using d4_t = non_native_vector_base<T, 4>;
-    using d8_t = non_native_vector_base<T, 8>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;

    using type = d8_t;

@@ -1215,11 +1378,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr const auto& AsType() const
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x8_;
        }
@@ -1244,11 +1408,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr auto& AsType()
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x8_;
        }
@@ -1274,11 +1439,12 @@ struct vector_type<T, 8, typename std::enable_if_t<!is_native_type<T>()>>
 template <typename T>
 struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
 {
-    using d1_t  = T;
-    using d2_t  = non_native_vector_base<T, 2>;
-    using d4_t  = non_native_vector_base<T, 4>;
-    using d8_t  = non_native_vector_base<T, 8>;
-    using d16_t = non_native_vector_base<T, 16>;
+    using d1_t     = T;
+    using d1_nnv_t = non_native_vector_base<T, 1>;
+    using d2_t     = non_native_vector_base<T, 2>;
+    using d4_t     = non_native_vector_base<T, 4>;
+    using d8_t     = non_native_vector_base<T, 8>;
+    using d16_t    = non_native_vector_base<T, 16>;

    using type = d16_t;

@@ -1299,12 +1465,12 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr const auto& AsType() const
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x16_;
        }
@@ -1333,12 +1499,12 @@ struct vector_type<T, 16, typename std::enable_if_t<!is_native_type<T>()>>
    template <typename X>
    __host__ __device__ constexpr auto& AsType()
    {
-        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
-                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
-                          is_same<X, d16_t>::value,
+        static_assert(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value ||
+                          is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                          is_same<X, d8_t>::value || is_same<X, d16_t>::value,
                      "Something went wrong, please check src and dst types.");

-        if constexpr(is_same<X, d1_t>::value)
+        if constexpr(is_same<X, d1_t>::value || is_same<X, d1_nnv_t>::value)
        {
            return data_.d1x16_;
        }
@@ -1632,20 +1798,70 @@ using int8x32_t = typename vector_type<int8_t, 32>::type;
 using int8x64_t = typename vector_type<int8_t, 64>::type;

 // f8
-using f8x2_t  = typename vector_type<f8_t, 2>::type;
-using f8x4_t  = typename vector_type<f8_t, 4>::type;
-using f8x8_t  = typename vector_type<f8_t, 8>::type;
-using f8x16_t = typename vector_type<f8_t, 16>::type;
-using f8x32_t = typename vector_type<f8_t, 32>::type;
-using f8x64_t = typename vector_type<f8_t, 64>::type;
+using f8x2_fnuz_t  = typename vector_type<f8_fnuz_t, 2>::type;
+using f8x4_fnuz_t  = typename vector_type<f8_fnuz_t, 4>::type;
+using f8x8_fnuz_t  = typename vector_type<f8_fnuz_t, 8>::type;
+using f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type;
+using f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type;
+using f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type;

 // bf8
-using bf8x2_t  = typename vector_type<bf8_t, 2>::type;
-using bf8x4_t  = typename vector_type<bf8_t, 4>::type;
-using bf8x8_t  = typename vector_type<bf8_t, 8>::type;
-using bf8x16_t = typename vector_type<bf8_t, 16>::type;
-using bf8x32_t = typename vector_type<bf8_t, 32>::type;
-using bf8x64_t = typename vector_type<bf8_t, 64>::type;
+using bf8x2_fnuz_t  = typename vector_type<bf8_fnuz_t, 2>::type;
+using bf8x4_fnuz_t  = typename vector_type<bf8_fnuz_t, 4>::type;
+using bf8x8_fnuz_t  = typename vector_type<bf8_fnuz_t, 8>::type;
+using bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type;
+using bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type;
+using bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type;
+
+// f8
+using f8x2_ocp_t  = typename vector_type<f8_ocp_t, 2>::type;
+using f8x4_ocp_t  = typename vector_type<f8_ocp_t, 4>::type;
+using f8x8_ocp_t  = typename vector_type<f8_ocp_t, 8>::type;
+using f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type;
+using f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type;
+using f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type;
+
+// bf8
+using bf8x2_ocp_t  = typename vector_type<bf8_ocp_t, 2>::type;
+using bf8x4_ocp_t  = typename vector_type<bf8_ocp_t, 4>::type;
+using bf8x8_ocp_t  = typename vector_type<bf8_ocp_t, 8>::type;
+using bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type;
+using bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type;
+using bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type;
+
+#if CK_FP8_TYPE_OCP
+// f8
+using f8x2_t  = f8x2_ocp_t;
+using f8x4_t  = f8x4_ocp_t;
+using f8x8_t  = f8x8_ocp_t;
+using f8x16_t = f8x16_ocp_t;
+using f8x32_t = f8x32_ocp_t;
+using f8x64_t = f8x64_ocp_t;
+
+// bf8
+using bf8x2_t  = bf8x2_ocp_t;
+using bf8x4_t  = bf8x4_ocp_t;
+using bf8x8_t  = bf8x8_ocp_t;
+using bf8x16_t = bf8x16_ocp_t;
+using bf8x32_t = bf8x32_ocp_t;
+using bf8x64_t = bf8x64_ocp_t;
+#elif CK_FP8_TYPE_FNUZ
+// f8
+using f8x2_t  = f8x2_fnuz_t;
+using f8x4_t  = f8x4_fnuz_t;
+using f8x8_t  = f8x8_fnuz_t;
+using f8x16_t = f8x16_fnuz_t;
+using f8x32_t = f8x32_fnuz_t;
+using f8x64_t = f8x64_fnuz_t;
+
+// bf8
+using bf8x2_t  = bf8x2_fnuz_t;
+using bf8x4_t  = bf8x4_fnuz_t;
+using bf8x8_t  = bf8x8_fnuz_t;
+using bf8x16_t = bf8x16_fnuz_t;
+using bf8x32_t = bf8x32_fnuz_t;
+using bf8x64_t = bf8x64_fnuz_t;
+#endif

 // u8
 using uint8x2_t  = typename vector_type<uint8_t, 2>::type;
@@ -1702,7 +1918,7 @@ struct NumericLimits<int4_t>
 #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4

 template <>
-struct NumericLimits<f8_t>
+struct NumericLimits<f8_fnuz_t>
 {
    // negative zero nan mode with exp bias = 8
    static constexpr uint8_t binary_min    = 0x08; // 0b00001000
@@ -1715,17 +1931,17 @@ struct NumericLimits<f8_t>
    // static constexpr uint8_t binary_lowest = 0xF7; // 0b11110111
    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=0

-    __host__ __device__ static constexpr f8_t Min() { return f8_t(binary_min); }
+    __host__ __device__ static constexpr f8_fnuz_t Min() { return f8_fnuz_t(binary_min); }

-    __host__ __device__ static constexpr f8_t Max() { return f8_t(binary_max); }
+    __host__ __device__ static constexpr f8_fnuz_t Max() { return f8_fnuz_t(binary_max); }

-    __host__ __device__ static constexpr f8_t Lowest() { return f8_t(binary_lowest); }
+    __host__ __device__ static constexpr f8_fnuz_t Lowest() { return f8_fnuz_t(binary_lowest); }

-    __host__ __device__ static constexpr f8_t QuietNaN() { return f8_t(binary_qnan); }
+    __host__ __device__ static constexpr f8_fnuz_t QuietNaN() { return f8_fnuz_t(binary_qnan); }
 };

 template <>
-struct NumericLimits<bf8_t>
+struct NumericLimits<bf8_fnuz_t>
 {
    // negative zero nan mode with exp bias = 16
    static constexpr uint8_t binary_min    = 0x04; // 0b00000100
@@ -1738,13 +1954,59 @@ struct NumericLimits<bf8_t>
    // static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011
    // static constexpr uint8_t binary_qnan   = 0x79; // any sign, exp=1111, mant!=

-    __host__ __device__ static constexpr bf8_t Min() { return bf8_t(binary_min); }
+    __host__ __device__ static constexpr bf8_fnuz_t Min() { return bf8_fnuz_t(binary_min); }

-    __host__ __device__ static constexpr bf8_t Max() { return bf8_t(binary_max); }
+    __host__ __device__ static constexpr bf8_fnuz_t Max() { return bf8_fnuz_t(binary_max); }

-    __host__ __device__ static constexpr bf8_t Lowest() { return bf8_t(binary_lowest); }
+    __host__ __device__ static constexpr bf8_fnuz_t Lowest() { return bf8_fnuz_t(binary_lowest); }

-    __host__ __device__ static constexpr bf8_t QuietNaN() { return bf8_t(binary_qnan); }
+    __host__ __device__ static constexpr bf8_fnuz_t QuietNaN() { return bf8_fnuz_t(binary_qnan); }
+};
+
+template <> // limits of f8_ocp_t, kept as raw 8-bit encodings
+struct NumericLimits<f8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x08; // 0b00001000 = 2^-6 (exponent field 1)
+    static constexpr uint8_t binary_max    = 0x7E; // 0b01111110 = 448
+    static constexpr uint8_t binary_lowest = 0xFE; // 0b11111110 = -448 (binary_max, sign bit set)
+    static constexpr uint8_t binary_qnan   = 0x7F; // 0b01111111 (exponent/mantissa bits all set)
+    // Each accessor returns the corresponding encoding above, bit_cast to f8_ocp_t.
+    __host__ __device__ static constexpr f8_ocp_t Min() { return bit_cast<f8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr f8_ocp_t Max() { return bit_cast<f8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr f8_ocp_t Lowest()
+    {
+        return bit_cast<f8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr f8_ocp_t QuietNaN()
+    {
+        return bit_cast<f8_ocp_t>(binary_qnan);
+    }
+};
+
+template <> // limits of bf8_ocp_t, kept as raw 8-bit encodings
+struct NumericLimits<bf8_ocp_t>
+{
+    static constexpr uint8_t binary_min    = 0x04; // 0b00000100 = 2^-14 (exponent field 1)
+    static constexpr uint8_t binary_max    = 0x7B; // 0b01111011 = 57344
+    static constexpr uint8_t binary_lowest = 0xFB; // 0b11111011 = -57344 (binary_max, sign set)
+    static constexpr uint8_t binary_qnan   = 0x7D; // 0b01111101 (exponent all ones, mantissa 01)
+    // Each accessor returns the corresponding encoding above, bit_cast to bf8_ocp_t.
+    __host__ __device__ static constexpr bf8_ocp_t Min() { return bit_cast<bf8_ocp_t>(binary_min); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Max() { return bit_cast<bf8_ocp_t>(binary_max); }
+
+    __host__ __device__ static constexpr bf8_ocp_t Lowest()
+    {
+        return bit_cast<bf8_ocp_t>(binary_lowest);
+    }
+
+    __host__ __device__ static constexpr bf8_ocp_t QuietNaN()
+    {
+        return bit_cast<bf8_ocp_t>(binary_qnan);
+    }
 };

 template <typename T>
@@ -1787,7 +2049,7 @@ struct NumericUtils<half_t>
 };

 template <>
-struct NumericUtils<f8_t>
+struct NumericUtils<f8_fnuz_t>
 {
    static constexpr int exp  = 4;
    static constexpr int mant = 3;
@@ -1796,13 +2058,28 @@ struct NumericUtils<f8_t>
 };

 template <>
-struct NumericUtils<bf8_t>
+struct NumericUtils<bf8_fnuz_t>
 {
    static constexpr int exp  = 5;
    static constexpr int mant = 2;
    static constexpr int bias = 16; // negative zero nan mode
    // static constexpr int bias = 15; // ieee mode
 };
+template <> // bit-layout constants of f8_ocp_t
+struct NumericUtils<f8_ocp_t>
+{
+    static constexpr int exp  = 4; // number of exponent bits
+    static constexpr int mant = 3; // number of mantissa bits
+    static constexpr int bias = 7; // exponent bias (the fnuz variant in this file uses 8)
+};
+
+template <> // bit-layout constants of bf8_ocp_t
+struct NumericUtils<bf8_ocp_t>
+{
+    static constexpr int exp  = 5;  // number of exponent bits
+    static constexpr int mant = 2;  // number of mantissa bits
+    static constexpr int bias = 15; // exponent bias (the fnuz variant in this file uses 16)
+};

 template <>
 struct NumericUtils<bhalf_t>

--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 static inline __host__ bool isnan(int4_t x)
@@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 static inline __device__ half_t sqrt(half_t x)
 {

--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

+#include "ck/ck.hpp"
+
 namespace ck {

 // Pseudo random number generator
@@ -23,7 +25,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // version for fp16
-template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, std::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
@@ -38,9 +40,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // return 0 if data is not fp16 or fp32
-template <typename T,
-          uint32_t seed_t,
-          std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+template <
+    typename T,
+    uint32_t seed_t,
+    std::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
    std::ignore = id;

--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -9,7 +9,7 @@
 #include "ck/utility/array.hpp"

 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif
@@ -100,6 +100,18 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
    return type_convert<bhalf_t>(x_fp32);
 }

+template <> // builds an f8_ocp_t from an int that is first converted to f8_ocp_t::data_type
+inline __host__ __device__ constexpr f8_ocp_t type_convert<f8_ocp_t, int>(int x)
+{ // NOTE(review): data_type looks like the raw storage type, i.e. x is an encoding - confirm
+    return f8_ocp_t{type_convert<f8_ocp_t::data_type>(x)};
+}
+
+template <> // builds a bf8_ocp_t from an int that is first converted to bf8_ocp_t::data_type
+inline __host__ __device__ constexpr bf8_ocp_t type_convert<bf8_ocp_t, int>(int x)
+{ // NOTE(review): data_type looks like the raw storage type, i.e. x is an encoding - confirm
+    return bf8_ocp_t{type_convert<bf8_ocp_t::data_type>(x)};
+}
+
 // Convert X to Y
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert_sp(X x)
@@ -163,7 +175,7 @@ __host__ __device__ constexpr Y f8_convert_sr(X x);

 // convert fp32 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, float>(float x)
 {
    constexpr int seed = 1254739;
    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
@@ -189,33 +201,35 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }

 // convert fp16 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native converion
-    return f8_convert_sr<f8_t>(type_convert<float>(x));
+    return f8_convert_sr<f8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp32 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, float>(float x)
 {
    constexpr int seed = 1254739;
    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
@@ -240,28 +254,32 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp16 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native converion
-    return f8_convert_sr<bf8_t>(type_convert<float>(x));
+    return f8_convert_sr<bf8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

@@ -271,7 +289,7 @@ __host__ __device__ constexpr Y f8_convert_rne(X x);

 // convert fp32 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
    union
@@ -296,32 +314,34 @@ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
    return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }

 // convert fp16 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native converion
-    return f8_convert_rne<f8_t>(type_convert<float>(x));
+    return f8_convert_rne<f8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp32 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
    union
@@ -345,44 +365,59 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp16 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native converion
-    return f8_convert_rne<bf8_t>(type_convert<float>(x));
+    return f8_convert_rne<bf8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
+#endif
+}
+
+// convert fp32 to fnuz fp8 (CK_USE_SR_F8_CONVERSION: stochastic rounding, else nearest-even)
+template <>
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_fnuz_t>(x);
+#else
+    return f8_convert_rne<f8_fnuz_t>(x);
 #endif
 }

 // convert fp32 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_ocp_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }

 // convert fp8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
+inline __host__ __device__ float type_convert<float, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
    float fval;
@@ -392,30 +427,44 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
    return fval;
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(x);
 #endif
 }

 template <>
-inline __host__ __device__ float2_t type_convert<float2_t, f8x2_t>(f8x2_t x)
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_fnuz_t>(f8x2_fnuz_t x)
 {
 #if defined(__gfx94__)
    const auto i16val = bit_cast<uint16_t>(x);
    return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0);
 #else
    constexpr bool negative_zero_nan = true;
-    const auto f8x2_v                = vector_type<f8_t, 2>(x);
+    const auto f8x2_v                = vector_type<f8_fnuz_t, 2>(x);
    vector_type<float, 2> f32x2_v;
    f32x2_v.template AsType<float>()(Number<0>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<0>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<0>{}]);
    f32x2_v.template AsType<float>()(Number<1>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<1>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<1>{}]);
    return f32x2_v.template AsType<float2_t>()[Number<0>{}];
 #endif
 }

+template <> // converts a vector of two f8_ocp_t values to a float2_t
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH // both elements go through one packed conversion call
+    return fp8_impl::cast_to_f32x2_from_f8x2<f8_ocp_t::default_interpret>(
+        x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
+#else // element 0 and element 1 are converted one at a time
+    return float2_t{fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<0>{}]),
+                    fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<1>{}])};
+#endif
+}
+
 template <>
 inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 {
@@ -428,42 +477,64 @@ inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)

 // convert fp16 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_fnuz_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_fnuz_t>(x);
+#endif
+}
+
+// convert fp16 to ocp fp8 (CK_USE_SR_F8_CONVERSION: f8_convert_sr, else f8_convert_rne)
+template <>
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_ocp_t>(x);
+#else
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }

 // convert fp8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
+inline __host__ __device__ half_t type_convert<half_t, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
    // use native conversion to float and convert to fp16
    return type_convert<half_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, half_t, negative_zero_nan>(x);
+#endif
+}
+
+// convert fp32 to fnuz bf8 (CK_USE_SR_F8_CONVERSION: stochastic rounding, else nearest-even)
+template <>
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_fnuz_t>(x);
+#else
+    return f8_convert_rne<bf8_fnuz_t>(x);
 #endif
 }

 // convert fp32 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_ocp_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }

 // convert bf8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
+inline __host__ __device__ float type_convert<float, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
    float fval;
@@ -473,31 +544,42 @@ inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
    return fval;
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, float, negative_zero_nan>(x);
+#endif
+}
+
+// convert fp16 to fnuz bf8 (CK_USE_SR_F8_CONVERSION: stochastic rounding, else nearest-even)
+template <>
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_fnuz_t>(x);
+#else
+    return f8_convert_rne<bf8_fnuz_t>(x);
 #endif
 }

 // convert fp16 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_ocp_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }

 // convert bf8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
+inline __host__ __device__ half_t type_convert<half_t, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
    // use native conversion to float and convert to fp16
    return type_convert<half_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, half_t, negative_zero_nan>(x);
 #endif
 }


--- a/include/ck_tile/README.md
+++ b/include/ck_tile/README.md
-# ck_tile
+[Back to the main page](../../README.md)
+# Composable Kernel Tile
 ## concept
 `ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. It introduces the following basic concepts to help users build their own operators:
 - tensor coordinate transformation: this is the core concept of the layout/index transform abstraction, at both compile time and run time.

--- a/include/ck_tile/core/utility/amd_address_space.hpp
+++ b/include/ck_tile/core/utility/amd_address_space.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core/config.hpp"
+
+// Address Space for AMDGCN
+// https://llvm.org/docs/AMDGPUUsage.html#address-space
+
+namespace ck_tile {
+
+#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
+
+template <typename T>
+__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
+{
+    // cast a pointer in "Constant" address space (4) to "Generic" address space (0);
+    // only a C-style pointer cast seems to compile here, so -Wold-style-cast is silenced locally
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+    return (T*)p; // NOLINT(old-style-cast)
+#pragma clang diagnostic pop
+}
+
+template <typename T>
+__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
+{
+    // cast a pointer in "Generic" address space (0) to "Constant" address space (4);
+    // only a C-style pointer cast seems to compile here, so -Wold-style-cast is silenced locally
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
+#pragma clang diagnostic pop
+}
+
+} // namespace ck_tile
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -183,4 +183,116 @@ void reference_gemm_gpu(DeviceMem& a_device,

    return;
 }
+
+// GPU reference for batched GEMM: launches naive_gemm_kernel once per batch.
+//
+// a_device / b_device supply the inputs and c_device receives the result; all three are staged
+// through temporary hipMalloc'ed buffers. M, N, K and stride_a/b/c are forwarded to the kernel
+// unchanged; batch i starts i * batch_stride_A/B/C elements into the respective buffer.
+//
+// Failures are reported on std::cerr and never thrown. When an allocation or an input copy
+// fails, the temporaries are released and the function returns without writing to c_device.
+// A failed launch or a failed copy of the result is reported, and the temporaries are still
+// released.
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC>
+void reference_batched_gemm_gpu(DeviceMem& a_device,
+                                DeviceMem& b_device,
+                                DeviceMem& c_device,
+                                index_t M,
+                                index_t N,
+                                index_t K,
+                                index_t stride_a,
+                                index_t stride_b,
+                                index_t stride_c,
+                                index_t batch_stride_A,
+                                index_t batch_stride_B,
+                                index_t batch_stride_C,
+                                index_t batch_count)
+{
+    // Reports a failed HIP call on std::cerr; returns true when `err` signals a failure.
+    const auto failed = [](hipError_t err, const char* what) {
+        if(err != hipSuccess)
+        {
+            std::cerr << what << hipGetErrorString(err) << std::endl;
+        }
+        return err != hipSuccess;
+    };
+
+    // sizeof() comes first so every product is evaluated in size_t rather than in index_t.
+    const auto a_bytes = sizeof(ADataType) * batch_count * M * K;
+    const auto b_bytes = sizeof(BDataType) * batch_count * N * K;
+    const auto c_bytes = sizeof(CDataType) * batch_count * M * N;
+
+    ADataType* d_A = nullptr;
+    BDataType* d_B = nullptr;
+    CDataType* d_C = nullptr;
+
+    // Frees all three temporaries; pointers that were never allocated are still nullptr.
+    const auto release = [&]() {
+        failed(hipFree(d_A), "Error free the A memory: ");
+        failed(hipFree(d_B), "Error free the B memory: ");
+        failed(hipFree(d_C), "Error free the C memory: ");
+    };
+
+    const hipError_t errA = hipMalloc(&d_A, a_bytes);
+    const hipError_t errB = hipMalloc(&d_B, b_bytes);
+    const hipError_t errC = hipMalloc(&d_C, c_bytes);
+    // Only the first failing allocation is reported, as before.
+    if(failed(errA, "Error allocating device memory for A: ") ||
+       failed(errB, "Error allocating device memory for B: ") ||
+       failed(errC, "Error allocating device memory for C: "))
+    {
+        release(); // do not leak the allocations that did succeed
+        return;    // Early exit on error
+    }
+
+    // GetDeviceBuffer() presumably returns device memory (not host memory, as the previous copy
+    // kinds claimed), so hipMemcpyDefault lets the runtime infer the direction from the pointers.
+    const bool copyAFailed =
+        failed(hipMemcpy(d_A, a_device.GetDeviceBuffer(), a_bytes, hipMemcpyDefault),
+               "Error copying A to device: ");
+    const bool copyBFailed =
+        failed(hipMemcpy(d_B, b_device.GetDeviceBuffer(), b_bytes, hipMemcpyDefault),
+               "Error copying B to device: ");
+    if(copyAFailed || copyBFailed)
+    {
+        release(); // running the kernel on uninitialised inputs would only produce garbage
+        return;
+    }
+
+    int totalElements      = M * N;
+    int numThreadsPerBlock = 256; // Common choice for threads per block
+    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;
+
+    // NOTE(review): the temporaries hold batch_count * M * K (resp. N * K, M * N) elements while
+    // batches are addressed through batch_stride_*; that stays in bounds only if every batch
+    // stride equals the dense per-batch element count -- confirm with the callers.
+    for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
+    {
+        // Widen before multiplying so the element offset cannot overflow index_t.
+        const auto batch   = static_cast<long long>(batch_id);
+        ADataType* d_ATemp = d_A + batch * batch_stride_A;
+        BDataType* d_BTemp = d_B + batch * batch_stride_B;
+        CDataType* d_CTemp = d_C + batch * batch_stride_C;
+        naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
+            <<<numBlocks, numThreadsPerBlock>>>(
+                d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
+    }
+    // Kernel launches do not return a status; pick up launch errors explicitly.
+    failed(hipGetLastError(), "Error launching naive_gemm_kernel: ");
+
+    // Copy the result back into the caller's buffer.
+    failed(hipMemcpy(c_device.GetDeviceBuffer(), d_C, c_bytes, hipMemcpyDefault),
+           "Error copying C to device: ");
+
+    release();
+
+    return;
+}
 } // namespace ck_tile
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -998,14 +998,14 @@ struct FmhaFwdKernel
                return pad_tensor_view(
                    q_dram_naive,
                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
            }
            else
            {
                return pad_tensor_view(
                    q_dram_naive,
                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
            }
        }();
        const auto k_dram = [&]() {
@@ -1019,7 +1019,7 @@ struct FmhaFwdKernel
            return pad_tensor_view(
                k_dram_naive,
                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<false, kPadHeadDimQ>{});
        }();
        const auto v_dram = [&]() {
            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
@@ -1041,7 +1041,7 @@ struct FmhaFwdKernel
                return pad_tensor_view(
                    v_dram_transposed,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, false>{});
            }
            else
            {
@@ -1055,7 +1055,7 @@ struct FmhaFwdKernel
                return pad_tensor_view(
                    v_dram_naive,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<false, kPadSeqLenK>{});
            }
        }();

@@ -1097,9 +1097,8 @@ struct FmhaFwdKernel
                        number<FmhaPipeline::kAlignmentBias>{},
                        number<1>{});

-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
                }();

                return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});

--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp
@@ -339,7 +339,7 @@ struct FmhaFwdSplitKVCombineKernel
                number<FmhaPipeline::kAlignmentOacc>{},
                number<1>{});

-            auto o_acc_dram_view = pad_tensor_view(
+            const auto o_acc_dram_view = pad_tensor_view(
                o_acc_dram_naive,
                make_tuple(number<1>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
                sequence<false, kPadSeqLenQ, kPadHeadDimV>{});

--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -623,14 +623,14 @@ struct FmhaFwdSplitKVKernel
                return pad_tensor_view(
                    q_dram_naive,
                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
            }
            else
            {
                return pad_tensor_view(
                    q_dram_naive,
                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
-                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
+                    sequence<false, kPadHeadDimQ>{});
            }
        }();

@@ -645,7 +645,7 @@ struct FmhaFwdSplitKVKernel
            return pad_tensor_view(
                k_dram_naive,
                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
-                sequence<kPadSeqLenK, kPadHeadDimQ>{});
+                sequence<false, kPadHeadDimQ>{});
        };
        const auto k_dram = [&]() {
            if constexpr(kIsPagedKV)
@@ -678,7 +678,7 @@ struct FmhaFwdSplitKVKernel
                return pad_tensor_view(
                    v_dram_transposed,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<kPadHeadDimV, false>{});
            }
            else
            {
@@ -692,7 +692,7 @@ struct FmhaFwdSplitKVKernel
                return pad_tensor_view(
                    v_dram_naive,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, kPadSeqLenK>{});
+                    sequence<false, kPadSeqLenK>{});
            }
        };
        const auto v_dram = [&]() {
@@ -804,9 +804,8 @@ struct FmhaFwdSplitKVKernel
                        number<FmhaPipeline::kAlignmentBias>{},
                        number<1>{});

-                    return pad_tensor_view(bias_dram_naive,
-                                           bias_dram_window_lengths,
-                                           sequence<kPadSeqLenQ, kPadSeqLenK>{});
+                    return pad_tensor_view(
+                        bias_dram_naive, bias_dram_window_lengths, sequence<false, kPadSeqLenK>{});
                }();

                return make_tile_window(bias_dram, bias_dram_window_lengths, {i_m0, 0});

--- a/include/ck_tile/ops/gemm.hpp
+++ b/include/ck_tile/ops/gemm.hpp
@@ -25,6 +25,10 @@
 #include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"

--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -41,13 +41,16 @@ struct BlockUniversalGemmAsBsCr
        static constexpr index_t MWarp = config.template at<1>();
        static constexpr index_t NWarp = config.template at<2>();

-        static_assert(MWarp == BlockGemmShape::BlockWarps::at(number<0>{}),
+        using I0 = number<0>;
+        using I1 = number<1>;
+
+        static_assert(MWarp == BlockGemmShape::BlockWarps::at(I0{}),
                      "Error! WarpGemm's MWarp is not consisten with BlockGemmShape!");
-        static_assert(NWarp == BlockGemmShape::BlockWarps::at(number<1>{}),
+        static_assert(NWarp == BlockGemmShape::BlockWarps::at(I1{}),
                      "Error! WarpGemm's NWarp is not consisten with BlockGemmShape!");
-        static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(number<0>{}),
+        static_assert(WarpGemm::kM == BlockGemmShape::WarpTile::at(I0{}),
                      "Error! WarpGemm's M is not consisten with BlockGemmShape!");
-        static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(number<1>{}),
+        static_assert(WarpGemm::kN == BlockGemmShape::WarpTile::at(I1{}),
                      "Error! WarpGemm's N is not consisten with BlockGemmShape!");

        static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM);
@@ -99,6 +102,9 @@ struct BlockUniversalGemmAsBsCr

    static constexpr auto Scheduler = Traits::Scheduler;

+    using I0 = number<0>;
+    using I1 = number<1>;
+
    private:
    template <GemmPipelineScheduler Scheduler, typename GemmTraits>
    struct BlockGemmImpl
@@ -114,35 +120,31 @@ struct BlockUniversalGemmAsBsCr
                                       const ASmemBlockWindow& a_block_window,
                                       const BSmemBlockWindow& b_block_window)
        {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                          "The ADataType and BDataType as defined in "
                          "traits should be the same as correspoinding block window data type!");

            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                "MPerBlock, NPerBlock, KPerBlock defined in "
                " BlockGemmShape are different from A/B block smem windows apropriate dims!");

-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);

            // TODO: refactor warp_window tile type to class member as it should be
            // compile-time known information.
            auto a_warp_window_tmp = make_tile_window(
                a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
-                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
+                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));

            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;

@@ -156,16 +158,15 @@ struct BlockUniversalGemmAsBsCr

            statically_indexed_array<
                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::MIterPerWarp>
+                MIterPerWarp>
                a_warp_windows;

            // construct B-warp-window
            auto b_warp_window_tmp = make_tile_window(
                b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
-                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
+                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));

            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;

@@ -179,10 +180,10 @@ struct BlockUniversalGemmAsBsCr

            statically_indexed_array<
                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::NIterPerWarp>
+                NIterPerWarp>
                b_warp_windows;

-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;

@@ -193,7 +194,7 @@ struct BlockUniversalGemmAsBsCr
                });
            });

-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;

@@ -203,8 +204,8 @@ struct BlockUniversalGemmAsBsCr
                });
            });

-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;

            constexpr auto c_warp_y_lengths =
                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
@@ -212,10 +213,10 @@ struct BlockUniversalGemmAsBsCr

            // hot loop:
            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                    const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));

-                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                        const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));

                        // read C warp tensor from C block tensor-
@@ -226,7 +227,7 @@ struct BlockUniversalGemmAsBsCr
                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));

                        // warp GEMM
-                        typename GemmTraits::WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);
+                        WarpGemm{}(c_warp_tensor, a_warp_tile, b_warp_tile);

                        // write C warp tensor into C block tensor
                        c_block_tensor.set_y_sliced_thread_data(
@@ -243,13 +244,13 @@ struct BlockUniversalGemmAsBsCr
    struct BlockGemmImpl<GemmPipelineScheduler::Intrawave, GemmTraits>
    {
        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::AWarpTile, GemmTraits::KIterPerWarp>,
-            GemmTraits::MIterPerWarp>
+            statically_indexed_array<typename GemmTraits::AWarpTile, KIterPerWarp>,
+            MIterPerWarp>
            a_warp_tiles_;

        statically_indexed_array<
-            statically_indexed_array<typename GemmTraits::BWarpTile, GemmTraits::KIterPerWarp>,
-            GemmTraits::NIterPerWarp>
+            statically_indexed_array<typename GemmTraits::BWarpTile, KIterPerWarp>,
+            NIterPerWarp>
            b_warp_tiles_;

        template <typename ASmemBlockWindow, typename BSmemBlockWindow>
@@ -257,30 +258,27 @@ struct BlockUniversalGemmAsBsCr
                                          const BSmemBlockWindow& b_block_window)
        {
            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                "MPerBlock, NPerBlock, KPerBlock defined in "
                " BlockGemmShape are different from A/B block smem windows apropriate dims!");

-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                          "The ADataType and BDataType as defined in "
                          "traits should be the same as correspoinding block window data type!");

-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);

            // TODO: refactor warp_window tile type to class member as it should be
            // compile-time known information.
            auto a_warp_window_tmp = make_tile_window(
                a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
-                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
+                a_block_window.get_window_origin() + multi_index<2>{iMWarp * WarpGemm::kM, 0},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));

            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;

@@ -292,18 +290,16 @@ struct BlockUniversalGemmAsBsCr
                              AWarpWindow{}.get_window_lengths(),
                          "AWarpWindow lengths must be equal to AWarpTile lengths!");

-            statically_indexed_array<
-                statically_indexed_array<AWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::MIterPerWarp>
+            statically_indexed_array<statically_indexed_array<AWarpWindow, KIterPerWarp>,
+                                     MIterPerWarp>
                a_warp_windows;

            // construct B-warp-window
            auto b_warp_window_tmp = make_tile_window(
                b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
-                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, 0},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
+                b_block_window.get_window_origin() + multi_index<2>{iNWarp * WarpGemm::kN, 0},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));

            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;

@@ -315,13 +311,12 @@ struct BlockUniversalGemmAsBsCr
                              BWarpWindow{}.get_window_lengths(),
                          "BWarpWindow lengths must be equal to BWarpTile lengths!");

-            statically_indexed_array<
-                statically_indexed_array<BWarpWindow, GemmTraits::KIterPerWarp>,
-                GemmTraits::NIterPerWarp>
+            statically_indexed_array<statically_indexed_array<BWarpWindow, KIterPerWarp>,
+                                     NIterPerWarp>
                b_warp_windows;

-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;

                    // TODO: I don't have to move 0,0 window!
@@ -331,8 +326,8 @@ struct BlockUniversalGemmAsBsCr
                });
            });

-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
-                static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;

                    move_tile_window(b_warp_windows(nIter)(kIter),
@@ -341,12 +336,12 @@ struct BlockUniversalGemmAsBsCr
                });
            });

-            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                    // read A warp tensor from A block window
                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
                });
-                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                    // read B warp tensor from B Block window
                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
                });
@@ -359,22 +354,21 @@ struct BlockUniversalGemmAsBsCr
                                       [[maybe_unused]] const ASmemBlockWindow& a_block_window,
                                       [[maybe_unused]] const BSmemBlockWindow& b_block_window)
        {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");

-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;

            constexpr auto c_warp_y_lengths =
                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
            constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};

            // hot loop:
-            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                    static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                        // read C warp tensor from C block tensor-
                        CWarpTensor c_warp_tensor;

@@ -383,9 +377,9 @@ struct BlockUniversalGemmAsBsCr
                            merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));

                        // warp GEMM
-                        typename GemmTraits::WarpGemm{}(c_warp_tensor,
-                                                        a_warp_tiles_[mIter][kIter],
-                                                        b_warp_tiles_[nIter][kIter]);
+                        WarpGemm{}(c_warp_tensor,
+                                   a_warp_tiles_[mIter][kIter],
+                                   b_warp_tiles_[nIter][kIter]);

                        // write C warp tensor into C block tensor
                        c_block_tensor.set_y_sliced_thread_data(
@@ -412,12 +406,12 @@ struct BlockUniversalGemmAsBsCr

        statically_indexed_array<
            statically_indexed_array<typename GemmTraits::AWarpTile, KInnerLoopIter>,
-            GemmTraits::MIterPerWarp>
+            MIterPerWarp>
            a_warp_tiles_;

        statically_indexed_array<
            statically_indexed_array<typename GemmTraits::BWarpTile, KInnerLoopIter>,
-            GemmTraits::NIterPerWarp>
+            NIterPerWarp>
            b_warp_tiles_;

        template <index_t KIdx, typename ASmemBlockWindow, typename BSmemBlockWindow>
@@ -425,30 +419,28 @@ struct BlockUniversalGemmAsBsCr
                                          const BSmemBlockWindow& b_block_window)
        {
            static_assert(
-                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[number<0>{}] &&
-                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[number<1>{}],
+                GemmTraits::MPerBlock == ASmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::NPerBlock == BSmemBlockWindow{}.get_window_lengths()[I0{}] &&
+                    GemmTraits::KPerBlock == ASmemBlockWindow{}.get_window_lengths()[I1{}],
                "MPerBlock, NPerBlock, KPerBlock defined in "
                " BlockGemmShape are different from A/B block smem windows apropriate dims!");

-            static_assert(std::is_same_v<typename GemmTraits::ADataType,
-                                         typename ASmemBlockWindow::DataType> &&
-                              std::is_same_v<typename GemmTraits::BDataType,
-                                             typename BSmemBlockWindow::DataType>,
+            static_assert(std::is_same_v<ADataType, typename ASmemBlockWindow::DataType> &&
+                              std::is_same_v<BDataType, typename BSmemBlockWindow::DataType>,
                          "The ADataType and BDataType as defined in "
                          "traits should be the same as correspoinding block window data type!");

-            const index_t iMWarp = get_warp_id() / GemmTraits::NWarp;
-            const index_t iNWarp = get_warp_id() - (iMWarp * GemmTraits::NWarp);
+            const index_t iMWarp = get_warp_id() / NWarp;
+            const index_t iNWarp = get_warp_id() - (iMWarp * NWarp);

            // TODO: refactor warp_window tile type to class member as it should be
            // compile-time known information.
            auto a_warp_window_tmp = make_tile_window(
                a_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kM>{}, number<GemmTraits::WarpGemm::kK>{}),
+                make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
                a_block_window.get_window_origin() +
-                    multi_index<2>{iMWarp * GemmTraits::WarpGemm::kM, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::AWarpDstrEncoding{}));
+                    multi_index<2>{iMWarp * WarpGemm::kM, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));

            using AWarpWindow = remove_cvref_t<decltype(a_warp_window_tmp)>;

@@ -461,16 +453,16 @@ struct BlockUniversalGemmAsBsCr
                          "AWarpWindow lengths must be equal to AWarpTile lengths!");

            statically_indexed_array<statically_indexed_array<AWarpWindow, KInnerLoopIter>,
-                                     GemmTraits::MIterPerWarp>
+                                     MIterPerWarp>
                a_warp_windows;

            // construct B-warp-window
            auto b_warp_window_tmp = make_tile_window(
                b_block_window.get_bottom_tensor_view(),
-                make_tuple(number<GemmTraits::WarpGemm::kN>{}, number<GemmTraits::WarpGemm::kK>{}),
+                make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
                b_block_window.get_window_origin() +
-                    multi_index<2>{iNWarp * GemmTraits::WarpGemm::kN, KIdx * KPerInnerLoop},
-                make_static_tile_distribution(typename GemmTraits::WarpGemm::BWarpDstrEncoding{}));
+                    multi_index<2>{iNWarp * WarpGemm::kN, KIdx * KPerInnerLoop},
+                make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));

            using BWarpWindow = remove_cvref_t<decltype(b_warp_window_tmp)>;

@@ -483,10 +475,10 @@ struct BlockUniversalGemmAsBsCr
                          "BWarpWindow lengths must be equal to BWarpTile lengths!");

            statically_indexed_array<statically_indexed_array<BWarpWindow, KInnerLoopIter>,
-                                     GemmTraits::NIterPerWarp>
+                                     NIterPerWarp>
                b_warp_windows;

-            static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                    a_warp_windows(mIter)(kIter) = a_warp_window_tmp;

@@ -496,7 +488,7 @@ struct BlockUniversalGemmAsBsCr
                });
            });

-            static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                    b_warp_windows(nIter)(kIter) = b_warp_window_tmp;

@@ -508,11 +500,11 @@ struct BlockUniversalGemmAsBsCr

            // TODO check if a_warp_tiles has same desc as a_warp_window
            static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
-                static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                    // read A warp tensor from A block window
                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
                });
-                static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                    // read B warp tensor from B Block window
                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
                });
@@ -525,13 +517,12 @@ struct BlockUniversalGemmAsBsCr
                                       const ASmemBlockWindow& a_block_window,
                                       const BSmemBlockWindow& b_block_window)
        {
-            static_assert(
-                std::is_same_v<typename GemmTraits::CDataType, typename CBlockTensor::DataType>,
-                "The CDataType as defined in traits should be the same as correspoinding "
-                "C block tensor data type!");
+            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                          "The CDataType as defined in traits should be the same as correspoinding "
+                          "C block tensor data type!");

-            using CWarpDstr   = typename GemmTraits::WarpGemm::CWarpDstr;
-            using CWarpTensor = typename GemmTraits::WarpGemm::CWarpTensor;
+            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using CWarpTensor = typename WarpGemm::CWarpTensor;

            constexpr auto c_warp_y_lengths =
                to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
@@ -555,8 +546,8 @@ struct BlockUniversalGemmAsBsCr
                }

                static_for<0, KInnerLoopIter, 1>{}([&](auto kInnerIter) {
-                    static_for<0, GemmTraits::MIterPerWarp, 1>{}([&](auto mIter) {
-                        static_for<0, GemmTraits::NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                            // read C warp tensor from C block tensor-
                            CWarpTensor c_warp_tensor;

@@ -573,17 +564,17 @@ struct BlockUniversalGemmAsBsCr
                            // penalty
                            if constexpr(kIter.value == KRepeat - 1 &&
                                         kInnerIter.value == KInnerLoopIter - 1 &&
-                                         mIter.value == GemmTraits::MIterPerWarp - 1 &&
-                                         nIter.value == GemmTraits::NIterPerWarp - 1)
+                                         mIter.value == MIterPerWarp - 1 &&
+                                         nIter.value == NIterPerWarp - 1)
                            {
                                __builtin_amdgcn_sched_barrier(0);
                                block_sync_lds();
                                __builtin_amdgcn_sched_barrier(0);
                            }
                            // warp GEMM
-                            typename GemmTraits::WarpGemm{}(c_warp_tensor,
-                                                            a_warp_tiles_[mIter][kInnerIter],
-                                                            b_warp_tiles_[nIter][kInnerIter]);
+                            WarpGemm{}(c_warp_tensor,
+                                       a_warp_tiles_[mIter][kInnerIter],
+                                       b_warp_tiles_[nIter][kInnerIter]);

                            // write C warp tensor into C block tensor
                            c_block_tensor.set_y_sliced_thread_data(
@@ -632,7 +623,7 @@ struct BlockUniversalGemmAsBsCr
    CK_TILE_DEVICE void LocalPrefetch(const ASmemBlockWindow& a_block_window,
                                      const BSmemBlockWindow& b_block_window)
    {
-        block_gemm_impl_.template LocalPrefetch(a_block_window, b_block_window);
+        block_gemm_impl_.LocalPrefetch(a_block_window, b_block_window);
    }

    // C += A * B
@@ -641,7 +632,7 @@ struct BlockUniversalGemmAsBsCr
                                   const ASmemBlockWindow& a_block_window,
                                   const BSmemBlockWindow& b_block_window)
    {
-        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+        block_gemm_impl_(c_block_tensor, a_block_window, b_block_window);
    }

    // C = A * B
@@ -650,7 +641,7 @@ struct BlockUniversalGemmAsBsCr
                                   const BSmemBlockWindow& b_block_window)
    {
        auto c_block_tensor = MakeCBlockTile();
-        block_gemm_impl_.template operator()(c_block_tensor, a_block_window, b_block_window);
+        block_gemm_impl_(c_block_tensor, a_block_window, b_block_window);
        return c_block_tensor;
    }