Merge branch 'develop' into codegen_hiprtc

13257d66 · arai713 · GitHub · 0b33037b · 5affda81 · 13257d66
Unverified Commit 13257d66 authored Dec 03, 2024 by arai713 Committed by GitHub Dec 03, 2024
20 changed files
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -549,8 +549,10 @@ __device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src
            (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, fp8_storage_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");

@@ -843,8 +845,8 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,

 #else

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};
    return src_thread_element_valid ? tmp : vector_t(0);
 #endif
 }
@@ -873,8 +875,8 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave,

    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;

-    vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
-        src_wave_buffer_resource, src_thread_addr_offset, 0);
+    vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
+        src_wave_buffer_resource, src_thread_addr_offset, 0)};

    return src_thread_element_valid ? tmp : vector_t(customized_value);
 }

--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -4,7 +4,7 @@
 #pragma once

 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -80,7 +80,7 @@ static inline __host__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __host__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 static inline __host__ bool isnan(int4_t x)
@@ -531,7 +531,7 @@ static inline __device__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
 };

-static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
+static inline __device__ bool isnan(f8_t x) { return ck::fp8_is_nan(x); };

 static inline __device__ half_t sqrt(half_t x)
 {

--- a/include/ck/utility/random_gen.hpp
+++ b/include/ck/utility/random_gen.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 #include <ck/utility/ignore.hpp>

+#include "ck/ck.hpp"
+
 namespace ck {

 // Pseudo random number generator
@@ -24,7 +26,7 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // version for fp16
-template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<half_t, T>{}, bool> = false>
+template <typename T, uint32_t seed_t, ck::enable_if_t<std::is_same<_Float16, T>{}, bool> = false>
 __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed = seed_t)
 {
    uint16_t x         = *(reinterpret_cast<uint16_t*>(&val));
@@ -39,9 +41,10 @@ __host__ __device__ uint32_t prand_generator(index_t id, T val, uint32_t seed =
 }

 // return 0 if data is not fp16 or fp32
-template <typename T,
+template <
+    typename T,
    uint32_t seed_t,
-          ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<half_t, T>{}), bool> = false>
+    ck::enable_if_t<!(std::is_same<float, T>{} || std::is_same<_Float16, T>{}), bool> = false>
 __host__ __device__ uint32_t prand_generator(int id, T val, uint32_t seed = seed_t)
 {
    ck::ignore = id;

--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -9,7 +9,7 @@
 #include "ck/utility/array.hpp"

 namespace ck {
-// Define the common macro for gfx94x models
+// Define the common macro for MI300 models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif
@@ -100,6 +100,18 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
    return type_convert<bhalf_t>(x_fp32);
 }

+template <>
+inline __host__ __device__ constexpr f8_ocp_t type_convert<f8_ocp_t, int>(int x)
+{
+    return f8_ocp_t{type_convert<f8_ocp_t::data_type>(x)};
+}
+
+template <>
+inline __host__ __device__ constexpr bf8_ocp_t type_convert<bf8_ocp_t, int>(int x)
+{
+    return bf8_ocp_t{type_convert<bf8_ocp_t::data_type>(x)};
+}
+
 // Convert X to Y
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert_sp(X x)
@@ -163,7 +175,7 @@ __host__ __device__ constexpr Y f8_convert_sr(X x);

 // convert fp32 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, float>(float x)
 {
    constexpr int seed = 1254739;
    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
@@ -189,33 +201,36 @@ inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }

 // convert fp16 to fp8 with stochastic rounding
 template <>
-inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native conversion
-    return f8_convert_sr<f8_t>(type_convert<float>(x));
+    return f8_convert_sr<f8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp32 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, float>(float x)
 {
    constexpr int seed = 1254739;
    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
@@ -240,28 +255,33 @@ inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, float>(float x)
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp16 to bf8 with stochastic rounding
 template <>
-inline __host__ __device__ bf8_t f8_convert_sr<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native conversion
-    return f8_convert_sr<bf8_t>(type_convert<float>(x));
+    return f8_convert_sr<bf8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<size_t>(&x), x);
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+
+    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

@@ -271,7 +291,7 @@ __host__ __device__ constexpr Y f8_convert_rne(X x);

 // convert fp32 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
    union
@@ -296,32 +316,34 @@ inline __host__ __device__ f8_t f8_convert_rne<f8_t, float>(float x)
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
    return utils::
-        cast_to_f8<float, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(x,
-                                                                                               rng);
+        cast_to_f8<float, f8_fnuz_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
+            x, rng);
 #endif
 }

 // convert fp16 to fp8 with rounding to nearest even
 template <>
-inline __host__ __device__ f8_t f8_convert_rne<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t f8_convert_rne<f8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native conversion
-    return f8_convert_rne<f8_t>(type_convert<float>(x));
+    return f8_convert_rne<f8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, f8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             f8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp32 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, float>(float x)
 {
 #if defined(__gfx94__)
    union
@@ -345,44 +367,59 @@ inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, float>(float x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<float, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<float,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
 #endif
 }

 // convert fp16 to bf8 with rounding to nearest even
 template <>
-inline __host__ __device__ bf8_t f8_convert_rne<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_fnuz_t f8_convert_rne<bf8_fnuz_t, half_t>(half_t x)
 {
 #if defined(__gfx94__)
    // convert to float and use native conversion
-    return f8_convert_rne<bf8_t>(type_convert<float>(x));
+    return f8_convert_rne<bf8_fnuz_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
-    return utils::
-        cast_to_f8<half_t, bf8_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
-            x, rng);
+    return utils::cast_to_f8<half_t,
+                             bf8_fnuz_t,
+                             negative_zero_nan,
+                             clip,
+                             (rm == f8_rounding_mode::stochastic)>(x, rng);
+#endif
+}
+
+// convert fp32 to fp8
+template <>
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_fnuz_t>(x);
+#else
+    return f8_convert_rne<f8_fnuz_t>(x);
 #endif
 }

 // convert fp32 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_ocp_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }

 // convert fp8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
+inline __host__ __device__ float type_convert<float, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
    float fval;
@@ -392,30 +429,44 @@ inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
    return fval;
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(x);
 #endif
 }

 template <>
-inline __host__ __device__ float2_t type_convert<float2_t, f8x2_t>(f8x2_t x)
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_fnuz_t>(f8x2_fnuz_t x)
 {
 #if defined(__gfx94__)
    const auto i16val = bit_cast<uint16_t>(x);
    return __builtin_amdgcn_cvt_pk_f32_fp8(i16val, 0);
 #else
    constexpr bool negative_zero_nan = true;
-    const auto f8x2_v                = vector_type<f8_t, 2>(x);
+    const auto f8x2_v                = vector_type<f8_fnuz_t, 2>(x);
    vector_type<float, 2> f32x2_v;
    f32x2_v.template AsType<float>()(Number<0>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<0>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<0>{}]);
    f32x2_v.template AsType<float>()(Number<1>{}) =
-        utils::cast_from_f8<f8_t, float, negative_zero_nan>(
-            f8x2_v.template AsType<f8_t>()[Number<1>{}]);
+        utils::cast_from_f8<f8_fnuz_t, float, negative_zero_nan>(
+            f8x2_v.template AsType<f8_fnuz_t>()[Number<1>{}]);
    return f32x2_v.template AsType<float2_t>()[Number<0>{}];
 #endif
 }

+template <>
+inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_t x)
+{
+#if CK_OCP_FP8_CVT_FAST_PATH
+    return fp8_impl::cast_to_f32x2_from_f8x2<f8_ocp_t::default_interpret>(
+        x.AsType<fp8_impl::fp8x2_storage_t>()[Number<0>{}]);
+#else
+    return float2_t{fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<0>{}]),
+                    fp8_impl::cast_from_f8<float, f8_ocp_t::wm, f8_ocp_t::we, false>(
+                        x.AsType<fp8_storage_t>()[Number<1>{}])};
+#endif
+}
+
 template <>
 inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
 {
@@ -428,42 +479,64 @@ inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)

 // convert fp16 to fp8
 template <>
-inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
+inline __host__ __device__ f8_fnuz_t type_convert<f8_fnuz_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<f8_fnuz_t>(x);
+#else
+    return f8_convert_rne<f8_fnuz_t>(x);
+#endif
+}
+
+// convert fp16 to fp8
+template <>
+inline __host__ __device__ f8_ocp_t type_convert<f8_ocp_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<f8_t>(x);
+    return f8_convert_sr<f8_ocp_t>(x);
 #else
-    return f8_convert_rne<f8_t>(x);
+    return f8_convert_rne<f8_ocp_t>(x);
 #endif
 }

 // convert fp8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
+inline __host__ __device__ half_t type_convert<half_t, f8_fnuz_t>(f8_fnuz_t x)
 {
 #if defined(__gfx94__)
    // use native conversion to float and convert to fp16
    return type_convert<half_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<f8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<f8_fnuz_t, half_t, negative_zero_nan>(x);
 #endif
 }

 // convert fp32 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, float>(float x)
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, float>(float x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_fnuz_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_fnuz_t>(x);
+#endif
+}
+
+// convert fp32 to bf8
+template <>
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, float>(float x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_ocp_t>(x);
+#else
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }

 // convert bf8 to fp32
 template <>
-inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
+inline __host__ __device__ float type_convert<float, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
    float fval;
@@ -473,31 +546,42 @@ inline __host__ __device__ float type_convert<float, bf8_t>(bf8_t x)
    return fval;
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, float, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, float, negative_zero_nan>(x);
+#endif
+}
+
+// convert fp16 to bf8
+template <>
+inline __host__ __device__ bf8_fnuz_t type_convert<bf8_fnuz_t, half_t>(half_t x)
+{
+#if CK_USE_SR_F8_CONVERSION
+    return f8_convert_sr<bf8_fnuz_t>(x);
+#else
+    return f8_convert_rne<bf8_fnuz_t>(x);
 #endif
 }

 // convert fp16 to bf8
 template <>
-inline __host__ __device__ bf8_t type_convert<bf8_t, half_t>(half_t x)
+inline __host__ __device__ bf8_ocp_t type_convert<bf8_ocp_t, half_t>(half_t x)
 {
 #if CK_USE_SR_F8_CONVERSION
-    return f8_convert_sr<bf8_t>(x);
+    return f8_convert_sr<bf8_ocp_t>(x);
 #else
-    return f8_convert_rne<bf8_t>(x);
+    return f8_convert_rne<bf8_ocp_t>(x);
 #endif
 }

 // convert bf8 to fp16
 template <>
-inline __host__ __device__ half_t type_convert<half_t, bf8_t>(bf8_t x)
+inline __host__ __device__ half_t type_convert<half_t, bf8_fnuz_t>(bf8_fnuz_t x)
 {
 #if defined(__gfx94__)
    // use native conversion to float and convert to fp16
    return type_convert<half_t>(type_convert<float>(x));
 #else
    constexpr bool negative_zero_nan = true;
-    return utils::cast_from_f8<bf8_t, half_t, negative_zero_nan>(x);
+    return utils::cast_from_f8<bf8_fnuz_t, half_t, negative_zero_nan>(x);
 #endif
 }


--- a/include/ck_tile/README.md
+++ b/include/ck_tile/README.md
-# ck_tile
+[Back to the main page](../../README.md)
+# Composable Kernel Tile
 ## concept
 `ck_tile` provides a programming model with templated abstractions to enable users to implement performance-critical kernels for machine learning workloads. It introduces the following basic concepts to help users build their own operators:
 - tensor coordinate transformation: the core concept behind the layout/index transform abstraction at both compile time and run time.

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -62,9 +62,9 @@ struct ReferenceGemm : public device::BaseOperator
            auto f_mk_kn_mn = [&](auto m, auto n) {
                const int K = arg.a_m_k_.mDesc.GetLengths()[1];

-                AccDataType v_acc = 0;
-                ComputeTypeA v_a  = 0;
-                ComputeTypeB v_b  = 0;
+                AccDataType v_acc{0};
+                ComputeTypeA v_a{0};
+                ComputeTypeB v_b{0};

                for(int k = 0; k < K; ++k)
                {
@@ -93,7 +93,7 @@ struct ReferenceGemm : public device::BaseOperator
                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                }

-                CDataType v_c = 0;
+                CDataType v_c{0};

                arg.c_element_op_(v_c, v_acc);


--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
--- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp
@@ -15,7 +15,7 @@ void add_device_pool3d_fwd_ndhwc_f8_instances(
        instances)
 {
    add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, false>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, false>{});
 }

 void add_device_pool3d_fwd_ndhwc_index_f8_instances(
@@ -23,7 +23,7 @@ void add_device_pool3d_fwd_ndhwc_index_f8_instances(
        instances)
 {
    add_device_operation_instances(
-        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F8, ReduceOpId, true>{});
+        instances, device_pool3d_fwd_ndhwc_instances<F8, F8, I32, F32, ReduceOpId, true>{});
 }

 } // namespace instance

--- a/profiler/README.md
+++ b/profiler/README.md
+[Back to the main page](../README.md)
+# Composable Kernel profiler
 ## Profile GEMM kernels
 ```bash
 #arg1: tensor operation (gemm=GEMM)
@@ -180,3 +182,13 @@ Note: Column to image kernel adds to the output memory, this will cause output b
 ################            op datatype  verify  init  log  time  dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
 ./bin/ckProfiler permute_scale        0       1     1    0     1    64   64   64       4096         64          1           1          64        4096
 ```
+
+## Convert MIOpen driver command to CKProfiler
+
+```bash
+python3 ../script/convert_miopen_driver_to_profiler.py \
+    /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3 \
+    -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
+```
+
+Only the convolution driver is supported.
--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -150,7 +150,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
        break;
    default:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
    }

--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -157,7 +157,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
        break;
    default:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
    }


--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -174,7 +174,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
        break;
    default:
        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
    }


--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -140,7 +140,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
        break;
    default:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
    }


--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -74,8 +74,8 @@ int profile_gemm_impl(int do_verification,
    switch(init_method)
    {
    case 0:
-        ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
-        ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
+        ck::utils::FillConstant<ADataType>{type_convert<ADataType>(1.f)}(a_m_k);
+        ck::utils::FillConstant<BDataType>{type_convert<BDataType>(1.f)}(b_k_n);
        break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);

--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -9,13 +9,38 @@ if (USE_BITINT_EXTENSION_INT4)
  endif()
 endif()

-add_gtest_executable(test_fp8 test_fp8.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_fp8 PRIVATE utility)
+
+
+add_custom_target(test_fp8)
+
+if (CK_USE_OCP_FP8)
+  add_gtest_executable(test_fp8_ocp test_fp8_ocp.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_fp8_ocp PRIVATE utility)
+  endif()
+
+  add_gtest_executable(test_bf8_ocp test_bf8_ocp.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_bf8_ocp PRIVATE utility)
+  endif()
+
+  add_dependencies(test_fp8 test_fp8_ocp)
+  add_dependencies(test_fp8 test_bf8_ocp)
 endif()
-add_gtest_executable(test_bf8 test_bf8.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_bf8 PRIVATE utility)
+
+if (CK_USE_FNUZ_FP8)
+  add_gtest_executable(test_fp8_fnuz test_fp8_fnuz.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_fp8_fnuz PRIVATE utility)
+  endif()
+
+  add_gtest_executable(test_bf8_fnuz test_bf8_fnuz.cpp)
+  if(result EQUAL 0)
+    target_link_libraries(test_bf8_fnuz PRIVATE utility)
+  endif()
+
+  add_dependencies(test_fp8 test_fp8_fnuz)
+  add_dependencies(test_fp8 test_bf8_fnuz)
 endif()

 add_gtest_executable(test_custom_type test_custom_type.cpp)

--- a/test/data_type/test_bf8.cpp
+++ b/test/data_type/test_bf8.cpp
@@ -5,158 +5,169 @@
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/type_convert.hpp"

-using ck::bf8_t;
+using ck::bf8_fnuz_t;
 using ck::f8_convert_rne;
 using ck::f8_convert_sr;
 using ck::half_t;
 using ck::type_convert;

-TEST(BF8, NumericLimits)
+TEST(BF8FNUZ, NumericLimits)
 {
    // constants given for negative zero nan (FNUZ) mode; QuietNaN is compared against 0x80
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Min(), type_convert<bf8_t>(0x04));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Max(), type_convert<bf8_t>(0x7F));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::Lowest(), type_convert<bf8_t>(0xFF));
-    EXPECT_EQ(ck::NumericLimits<bf8_t>::QuietNaN(), type_convert<bf8_t>(0x80));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Min(), type_convert<bf8_fnuz_t>(0x04));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Max(), type_convert<bf8_fnuz_t>(0x7F));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::Lowest(), type_convert<bf8_fnuz_t>(0xFF));
+    EXPECT_EQ(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(), type_convert<bf8_fnuz_t>(0x80));
 }

-TEST(BF8, ConvertFP32Nearest)
+TEST(BF8FNUZ, ConvertFP32Nearest)
 {
    // fix the tolerance value
    float abs_tol = 1e-6;
    // convert 0 float to bf8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(0.0f)), abs_tol);
    // don't run the next test on gfx11 devices
 #ifndef CK_SKIP_FLAKY_F8_TEST
    // convert minimal float to bf8 and back, check if holds
    ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::min())),
                abs_tol);
 #endif
-    // convert maximal bf8_t to float and check if equal to 57344.0
-    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_rne<bf8_t>(57344.0f)), abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // round-trip the maximal bf8_fnuz_t value (from NumericLimits) through bf8, check if holds
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(max_bf8_t_float)), abs_tol);
    // convert maximal float to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(57344.0f,
-                type_convert<float>(f8_convert_rne<bf8_t>(std::numeric_limits<float>::max())),
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                abs_tol);
-    // convert inf float to bf8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_rne<bf8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
                abs_tol);
    // positive norm float value to bf8 and back, check if holds
    float pos_float = 0.0000762939f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(pos_float)), abs_tol);
    // negative norm float value to bf8 and back, check if holds
    float neg_float = -0.0000610351f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(neg_float)), abs_tol);
    // positive subnorm float value to bf8 and back, check if holds
    pos_float = 0.0000305175f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(pos_float)), abs_tol);
    // negative subnorm float value to bf8 and back, check if holds
    neg_float = -0.0000152587f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_rne<bf8_fnuz_t>(neg_float)), abs_tol);
 }

-TEST(BF8, ConvertFP32Stochastic)
+TEST(BF8FNUZ, ConvertFP32Stochastic)
 {
    // fix the tolerance value
    float abs_tol = 1e-6;
    // convert 0 float to bf8 and back, check if holds
-    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_t>(0.0f)), abs_tol);
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(0.0f)), abs_tol);
    // convert minimal float to bf8 and back, check if holds
    ASSERT_NEAR(std::numeric_limits<float>::min(),
-                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::min())),
+                type_convert<float>(f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::min())),
                abs_tol);
-    // convert maximal bf8_t to float and check if equal to 57344.0
-    ASSERT_NEAR(57344.0f, type_convert<float>(f8_convert_sr<bf8_t>(57344.0f)), abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // round-trip the maximal bf8_fnuz_t value (from NumericLimits) through bf8, check if holds
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(max_bf8_t_float)), abs_tol);
    // convert maximal float to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(57344.0f,
-                type_convert<float>(f8_convert_sr<bf8_t>(std::numeric_limits<float>::max())),
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::max())),
                abs_tol);
-    // convert inf float to bf8_t and check if it is qNan
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_sr<bf8_t>(std::numeric_limits<float>::infinity()),
+    // convert inf float to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<bf8_fnuz_t>(std::numeric_limits<float>::infinity()),
                abs_tol);
    // positive norm float value to bf8 and back, check if holds
    float pos_float = 0.0000762939f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(pos_float)), abs_tol);
    // negative norm float value to bf8 and back, check if holds
    float neg_float = -0.0000610351f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(neg_float)), abs_tol);
    // positive subnorm float value to bf8 and back, check if holds
    pos_float = 0.0000305175f;
-    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_t>(pos_float)), abs_tol);
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(pos_float)), abs_tol);
    // negative subnorm float value to bf8 and back, check if holds
    neg_float = -0.0000152587f;
-    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_t>(neg_float)), abs_tol);
+    ASSERT_NEAR(neg_float, type_convert<float>(f8_convert_sr<bf8_fnuz_t>(neg_float)), abs_tol);
 }

-TEST(BF8, ConvertFP16Nearest)
+TEST(BF8FNUZ, ConvertFP16Nearest)
 {
    // fix the tolerance value
    float abs_tol = 1e-3;
    // convert 0 fp16 to bf8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(
+        half_t{0.0}, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(half_t{0.0})), abs_tol);
    // convert minimal fp16 to bf8 and back, check if holds
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                abs_tol);
-    // convert maximal bf8_t to fp16 and check if equal to 57344.0
+
+    const auto max_bf8_t_half = type_convert<half_t>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // round-trip the maximal bf8_fnuz_t value (from NumericLimits) through bf8, check if holds
    ASSERT_NEAR(
-        half_t{57344.0}, type_convert<half_t>(f8_convert_rne<bf8_t>(half_t{57344.0})), abs_tol);
+        max_bf8_t_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(max_bf8_t_half)), abs_tol);
    // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(half_t{57344.0},
-                type_convert<half_t>(f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::Max())),
+    ASSERT_NEAR(max_bf8_t_half,
+                type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
-    // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_rne<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_rne<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                abs_tol);
    // positive norm fp16 value to bf8 and back, check if holds
    half_t pos_half = half_t{0.0000762939};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(pos_half)), abs_tol);
    // negative norm fp16 value to bf8 and back, check if holds
    half_t neg_half = half_t{-0.0000610351};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(neg_half)), abs_tol);
    // positive subnorm fp16 value to bf8 and back, check if holds
    pos_half = half_t{0.0000305175};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(pos_half)), abs_tol);
    // negative subnorm fp16 value to bf8 and back, check if holds
    neg_half = half_t{-0.0000152587};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_rne<bf8_fnuz_t>(neg_half)), abs_tol);
 }

-TEST(BF8, ConvertFP16Stochastic)
+TEST(BF8FNUZ, ConvertFP16Stochastic)
 {
    // fix the tolerance value
    float abs_tol = 1e-3;
    // convert 0 fp16 to bf8 and back, check if holds
-    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{0.0})), abs_tol);
+    ASSERT_NEAR(half_t{0.0}, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(half_t{0.0})), abs_tol);
    // convert minimal fp16 to bf8 and back, check if holds
    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
-                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Min())),
+                type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::Min())),
                abs_tol);
-    // convert maximal bf8_t to fp16 and check if equal to 57344.0
+
+    const auto max_bf8_t_half = type_convert<half_t>(ck::NumericLimits<bf8_fnuz_t>::Max());
+    // round-trip the maximal bf8_fnuz_t value (from NumericLimits) through bf8, check if holds
    ASSERT_NEAR(
-        half_t{57344.0}, type_convert<half_t>(f8_convert_sr<bf8_t>(half_t{57344.0})), abs_tol);
+        max_bf8_t_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(max_bf8_t_half)), abs_tol);
    // convert maximal fp16 to bf8 and back, check if clipped to 57344.0
-    ASSERT_NEAR(half_t{57344.0},
-                type_convert<half_t>(f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::Max())),
+    ASSERT_NEAR(max_bf8_t_half,
+                type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::Max())),
                abs_tol);
-    // convert QuietNaN fp16 to bf8_t and check if it is QuietNaN
-    ASSERT_NEAR(type_convert<bf8_t>(0x80),
-                f8_convert_sr<bf8_t>(ck::NumericLimits<half_t>::QuietNaN()),
+    // convert QuietNaN fp16 to bf8_fnuz_t and check if it is QuietNaN
+    ASSERT_NEAR(ck::NumericLimits<bf8_fnuz_t>::QuietNaN(),
+                f8_convert_sr<bf8_fnuz_t>(ck::NumericLimits<half_t>::QuietNaN()),
                abs_tol);
    // positive norm fp16 value to bf8 and back, check if holds
    half_t pos_half = half_t{0.0000762939};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(pos_half)), abs_tol);
    // negative norm fp16 value to bf8 and back, check if holds
    half_t neg_half = half_t{-0.0000610351};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(neg_half)), abs_tol);
    // positive subnorm fp16 value to bf8 and back, check if holds
    pos_half = half_t{0.0000305175};
-    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_t>(pos_half)), abs_tol);
+    ASSERT_NEAR(pos_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(pos_half)), abs_tol);
    // negative subnorm fp16 value to bf8 and back, check if holds
    neg_half = half_t{-0.0000152587};
-    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_t>(neg_half)), abs_tol);
+    ASSERT_NEAR(neg_half, type_convert<half_t>(f8_convert_sr<bf8_fnuz_t>(neg_half)), abs_tol);
 }
--- a/test/data_type/test_bf8_ocp.cpp
+++ b/test/data_type/test_bf8_ocp.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::bf8_ocp_t;
+using ck::f8_convert_rne;
+using ck::f8_convert_sr;
+using ck::half_t;
+using ck::type_convert;
+
+TEST(BF8OCP, NumericLimits)
+{ // constants given for the OCP FP8 bf8 type (bf8_ocp_t); NaN never compares equal, inf is 0x7C/0xFC
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Min(),
+              type_convert<bf8_ocp_t>(0x04)); // 0b00000100 = 2^-14
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              type_convert<bf8_ocp_t>(0x7B)); // 0b01111011 = 57344
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::Lowest(),
+              type_convert<bf8_ocp_t>(0xFB)); // 0b11111011 = -57344
+    EXPECT_EQ(ck::NumericLimits<bf8_ocp_t>::QuietNaN().data,
+              type_convert<bf8_ocp_t>(0x7D).data); // 0b01111101
+    EXPECT_FALSE(ck::NumericLimits<bf8_ocp_t>::QuietNaN() ==
+                 ck::NumericLimits<bf8_ocp_t>::QuietNaN());
+    EXPECT_TRUE(ck::fp8_is_inf(type_convert<bf8_ocp_t>(0xFC)) &&
+                ck::fp8_is_inf(type_convert<bf8_ocp_t>(0x7C)));
+}
+
+TEST(BF8OCP, ConvertFP32Nearest)
+{
+    // fix the tolerance value (used where the round trip is not asserted to be exact)
+    float abs_tol = 1e-6;
+
+    // convert 0 float to bf8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_rne<bf8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to bf8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to float and check if equal to bf8 max
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_rne<bf8_ocp_t>(max_bf8_t_float)), 0.0f);
+
+    // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive normal float value to bf8 and back, check if holds
+    float pos_float = 0.0000762939f; // 10*2^-17
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_rne<bf8_ocp_t>(pos_float)), abs_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14
+    ASSERT_NEAR(neg_min_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(neg_min_bf8)), 0.0f);
+
+    // positive subnorm float value to bf8 and back, check if holds
+    constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15
+    ASSERT_NEAR(
+        pos_subnorm_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(pos_subnorm_bf8)), 0.0f);
+
+    // negated min subnorm bf8 value to bf8 and back, check if holds
+    constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16
+    ASSERT_NEAR(
+        min_subnorm_bf8, type_convert<float>(f8_convert_rne<bf8_ocp_t>(min_subnorm_bf8)), 0.0f);
+
+    // 2^-17 is half of the min subnorm bf8 value (2^-16); nearest conversion must give zero
+    constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17
+    ASSERT_EQ(0.0f, type_convert<float>(f8_convert_rne<bf8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to bf8_ocp_t and check if it is NaN
+    const auto bf8_nan = f8_convert_rne<bf8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP32Stochastic)
+{
+    // fix the tolerance value (used where the round trip is not asserted to be exact)
+    float abs_tol = 1e-6;
+
+    // convert 0 float to bf8 and back, check if holds
+    ASSERT_NEAR(0.0f, type_convert<float>(f8_convert_sr<bf8_ocp_t>(0.0f)), 0.0f);
+
+    // convert minimal float to bf8 and back, check if holds
+    ASSERT_NEAR(std::numeric_limits<float>::min(),
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::min())),
+                abs_tol);
+
+    const auto max_bf8_t_float = type_convert<float>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to float and check if equal to bf8 max
+    ASSERT_NEAR(
+        max_bf8_t_float, type_convert<float>(f8_convert_sr<bf8_ocp_t>(max_bf8_t_float)), 0.0f);
+
+    // convert maximal float to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_float,
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::max())),
+                0.0f);
+
+    // convert float infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(ck::NumericLimits<bf8_ocp_t>::Max(),
+              f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::infinity()));
+
+    // positive normal float value to bf8 and back, check if holds
+    float pos_float = 0.0000762939f; // 10*2^-17
+    ASSERT_NEAR(pos_float, type_convert<float>(f8_convert_sr<bf8_ocp_t>(pos_float)), abs_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr auto neg_min_bf8 = -0.00006103515625f; //-2^-14
+    ASSERT_NEAR(neg_min_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(neg_min_bf8)), 0.0f);
+
+    // positive subnorm float value to bf8 and back, check if holds
+    constexpr auto pos_subnorm_bf8 = 0.000030517578125f; // 2^-15
+    ASSERT_NEAR(
+        pos_subnorm_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(pos_subnorm_bf8)), 0.0f);
+
+    // negated min subnorm bf8 value to bf8 and back, check if holds
+    constexpr auto min_subnorm_bf8 = -0.0000152587890625f; //-2^-16
+    ASSERT_NEAR(
+        min_subnorm_bf8, type_convert<float>(f8_convert_sr<bf8_ocp_t>(min_subnorm_bf8)), 0.0f);
+
+    // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16, so allow 2^-16
+    constexpr auto less_than_min_subnorm = 0.00000762939453125f; // 2^-17
+    ASSERT_NEAR(0.0f,
+                type_convert<float>(f8_convert_sr<bf8_ocp_t>(less_than_min_subnorm)),
+                0.0000152587890625f);
+
+    // convert quiet NaN to bf8_ocp_t and check if it is NaN
+    const auto bf8_nan = f8_convert_sr<bf8_ocp_t>(std::numeric_limits<float>::quiet_NaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP16Nearest)
+{
+    // fix the tolerance value; half_t_zero doubles as the tolerance for exact round trips
+    constexpr half_t half_t_tol  = 1e-3;
+    constexpr half_t half_t_zero = 0.0;
+
+    // convert 0 half_t to bf8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t to bf8 and back, check if holds
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                half_t_tol);
+
+    const auto max_bf8_t_half_t = type_convert<half_t>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(max_bf8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<bf8_ocp_t>::Max(),
+        f8_convert_rne<bf8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive normal bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17
+    ASSERT_NEAR(
+        pos_norm_bf8, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(pos_norm_bf8)), half_t_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14
+    ASSERT_NEAR(
+        neg_min_bf8, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(neg_min_bf8)), half_t_zero);
+
+    // positive subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15
+    ASSERT_NEAR(pos_subnorm_bf8,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(pos_subnorm_bf8)),
+                half_t_zero);
+
+    // negated min subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t min_subnorm_bf8{-0.0000152587890625f}; //-2^-16
+    ASSERT_NEAR(min_subnorm_bf8,
+                type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(min_subnorm_bf8)),
+                half_t_zero);
+
+    // 2^-17 is half of the min subnorm bf8 value (2^-16); nearest conversion must give zero
+    constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17
+    ASSERT_EQ(half_t_zero, type_convert<half_t>(f8_convert_rne<bf8_ocp_t>(less_than_min_subnorm)));
+
+    // convert quiet NaN to bf8_ocp_t and check if it is NaN
+    const auto bf8_nan = f8_convert_rne<bf8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}
+
+TEST(BF8OCP, ConvertFP16Stochastic)
+{
+    // fix the tolerance value; half_t_zero doubles as the tolerance for exact round trips
+    constexpr half_t half_t_tol    = 1e-3;
+    constexpr half_t half_t_zero   = 0.0;
+    constexpr auto min_subnorm_bf8 = 0.0000152587890625f; // 2^-16
+
+    // convert 0 half_t to bf8 and back, check if holds
+    ASSERT_NEAR(
+        half_t_zero, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(half_t_zero)), half_t_zero);
+
+    // convert minimal half_t (6.103515625e-05) to bf8 and back, check if holds exactly
+    ASSERT_NEAR(ck::NumericLimits<half_t>::Min(),
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::Min())),
+                half_t_zero);
+
+    const auto max_bf8_t_half_t = type_convert<half_t>(ck::NumericLimits<bf8_ocp_t>::Max());
+
+    // convert maximal bf8_ocp_t to half_t and check if equal to bf8 max
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(max_bf8_t_half_t)),
+                half_t_zero);
+
+    // convert maximal half_t to bf8 and back, check if clipped to bf8 max (saturation to finite)
+    ASSERT_NEAR(max_bf8_t_half_t,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::Max())),
+                half_t_zero);
+
+    // convert half_t infinity to bf8_ocp_t and check if it is max value (saturation to finite)
+    ASSERT_EQ(
+        ck::NumericLimits<bf8_ocp_t>::Max(),
+        f8_convert_sr<bf8_ocp_t>(type_convert<half_t>(std::numeric_limits<float>::infinity())));
+
+    // positive normal bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_norm_bf8{0.0000762939f}; // 10*2^-17
+    ASSERT_NEAR(
+        pos_norm_bf8, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(pos_norm_bf8)), half_t_tol);
+
+    // negative smallest normal bf8 value to bf8 and back, check if holds
+    constexpr half_t neg_min_bf8{-0.00006103515625f}; //-2^-14
+    ASSERT_NEAR(
+        neg_min_bf8, type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(neg_min_bf8)), half_t_zero);
+
+    // positive subnorm bf8 value to bf8 and back, check if holds
+    constexpr half_t pos_subnorm_bf8{0.000030517578125f}; // 2^-15
+    ASSERT_NEAR(pos_subnorm_bf8,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(pos_subnorm_bf8)),
+                half_t_zero);
+
+    // negated min subnorm bf8 value to bf8 and back, check if holds
+    ASSERT_NEAR(half_t{-min_subnorm_bf8},
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(half_t{-min_subnorm_bf8})),
+                half_t_zero);
+
+    // smaller than min subnorm bf8 value to bf8 alternates between 0 and 2^-16, so allow 2^-16
+    constexpr half_t less_than_min_subnorm{0.00000762939453125f}; // 2^-17
+    ASSERT_NEAR(half_t_zero,
+                type_convert<half_t>(f8_convert_sr<bf8_ocp_t>(less_than_min_subnorm)),
+                half_t{min_subnorm_bf8});
+
+    // convert quiet NaN to bf8_ocp_t and check if it is NaN
+    const auto bf8_nan = f8_convert_sr<bf8_ocp_t>(ck::NumericLimits<half_t>::QuietNaN());
+    ASSERT_TRUE(ck::fp8_impl::ocp_bf8_is_nan(bf8_nan.data));
+}