Unverified Commit e093146e authored by Rostyslav Geyyer's avatar Rostyslav Geyyer Committed by GitHub
Browse files

Add MXFP6 and MXBF6 conversion methods (#270)

* Add conversions

* Add tests

* Add docstrings

* Add scaled conversions

* Add fp6/bf6 tests

* Remove misleading fp4 test case

* Add docstrings

* Clean up

* Address comments

* Set stricter tolerances for RNE tests

* Add missing tests

* Add native conversions to float

* Revert "Add native conversions to float"

This reverts commit 09467111f73b753c8cc3d597533b187940353dab.

* Update copyright years
parent 9598b9a0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -2287,6 +2287,8 @@ struct NumericUtils<f6_t>
static constexpr uint8_t positive_zero_mask = 0b000000;
static constexpr uint8_t negative_zero_mask = 0b100000;
static constexpr uint8_t set_sign_mask = 0b011111;
static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
......@@ -2316,6 +2318,8 @@ struct NumericUtils<bf6_t>
static constexpr uint8_t positive_zero_mask = 0b000000;
static constexpr uint8_t negative_zero_mask = 0b100000;
static constexpr uint8_t set_sign_mask = 0b011111;
static constexpr uint8_t data_max_positive_normal_mask = 0b011111;
static constexpr uint8_t data_max_negative_normal_mask = 0b111111;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -54,6 +54,8 @@ struct e8m0_bexp_t
// strict IEEE compliance for NaN
return data == other.data && data != nan_mask;
}
__host__ __device__ constexpr bool is_nan() const { return data == nan_mask; }
};
namespace utils {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/mxfp_utils.hpp"
namespace ck::utils {
/**
* @brief Checks if an f6_t value is NaN based on the provided scale.
*
* For f6_t data, NaN cannot be represented directly. Instead, this function
* determines NaN by checking if the scale is set to a quiet NaN.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param dataBytes The f6_t value to check (unused in this implementation).
* @return true if the scale indicates a NaN value, false otherwise.
*/
template <>
__host__ __device__ inline bool is_nan<f6_t>(e8m0_bexp_t const scale,
f6_t const dataBytes [[maybe_unused]])
{
// no need to check for data as it does not have NaN representation
return scale.is_nan();
}
/**
* @brief Checks if an bf6_t value is NaN based on the provided scale.
*
* For bf6_t data, NaN cannot be represented directly. Instead, this function
* determines NaN by checking if the scale is set to a quiet NaN.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param dataBytes The bf6_t value to check (unused in this implementation).
* @return true if the scale indicates a NaN value, false otherwise.
*/
template <>
__host__ __device__ inline bool is_nan<bf6_t>(e8m0_bexp_t const scale,
bf6_t const dataBytes [[maybe_unused]])
{
// no need to check for data as it does not have NaN representation
return scale.is_nan();
}
/**
* @brief Checks if an f6_t value is infinite.
*
* Because f6_t does not support infinite values, this function always returns false.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to check.
* @return Always false, as infinity is not represented in f6_t.
*/
template <>
__host__ __device__ inline bool is_inf<f6_t>(e8m0_bexp_t const scale [[maybe_unused]],
f6_t const data [[maybe_unused]])
{
// no inf representation for fp6
return false;
}
/**
* @brief Checks if an bf6_t value is infinite.
*
* Because bf6_t does not support infinite values, this function always returns false.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to check.
* @return Always false, as infinity is not represented in bf6_t.
*/
template <>
__host__ __device__ inline bool is_inf<bf6_t>(e8m0_bexp_t const scale [[maybe_unused]],
bf6_t const data [[maybe_unused]])
{
// no inf representation for bf6
return false;
}
/**
* @brief Checks whether an f6_t value is zero.
*
* If the specified f6_t is NaN, this function returns false.
* Otherwise, it masks out the sign bits and checks if the remaining bits
* are zero.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to check.
* @return true if the value is zero; otherwise false.
*/
template <>
__host__ __device__ inline bool is_zero<f6_t>(e8m0_bexp_t const scale, f6_t const data)
{
if(is_nan<f6_t>(scale, data))
return false;
// no need to check for scale as it does not have a 0 representation
f6_t result = (data & 0b00111111) & NumericUtils<f6_t>::set_sign_mask;
return result == 0b0;
}
/**
* @brief Checks whether an bf6_t value is zero.
*
* If the specified bf6_t is NaN, this function returns false.
* Otherwise, it masks out the sign bits and checks if the remaining bits
* are zero.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to check.
* @return true if the value is zero; otherwise false.
*/
template <>
__host__ __device__ inline bool is_zero<bf6_t>(e8m0_bexp_t const scale, bf6_t const data)
{
if(is_nan<bf6_t>(scale, data))
return false;
// no need to check for scale as it does not have a 0 representation
bf6_t result = (data & 0b00111111) & NumericUtils<bf6_t>::set_sign_mask;
return result == 0b0;
}
/**
* @brief Converts an f6_t value to a float based on an e8m0_bexp_t scale factor.
*
* Checks if the f6_t value is NaN or zero before performing the conversion.
* Applies the exponent from the scale to compute the final float result.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param data The f6_t value to convert.
* @return The converted float value.
*/
template <>
__host__ __device__ inline float to_float<f6_t>(e8m0_bexp_t const scale, f6_t const data)
{
if(is_nan<f6_t>(scale, data))
return std::numeric_limits<float>::quiet_NaN();
if(is_zero<f6_t>(scale, data))
return 0.0f;
f6_t prepared_data = data & 0b00111111;
int scale_exp = get_exponent_value<e8m0_bexp_t>(scale);
return convert_to_float<f6_t>(prepared_data, scale_exp);
}
/**
* @brief Converts an bf6_t value to a float based on an e8m0_bexp_t scale factor.
*
* Checks if the bf6_t value is NaN or zero before performing the conversion.
* Applies the exponent from the scale to compute the final float result.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param data The bf6_t value to convert.
* @return The converted float value.
*/
template <>
__host__ __device__ inline float to_float<bf6_t>(e8m0_bexp_t const scale, bf6_t const data)
{
if(is_nan<bf6_t>(scale, data))
return std::numeric_limits<float>::quiet_NaN();
if(is_zero<bf6_t>(scale, data))
return 0.0f;
bf6_t prepared_data = data & 0b00111111;
int scale_exp = get_exponent_value<e8m0_bexp_t>(scale);
return convert_to_float<bf6_t>(prepared_data, scale_exp);
}
/**
* @brief Converts a float to f6_t with saturation.
*
* If the input is NaN or exceeds the representable range for f6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated f6_t value.
*/
template <>
__host__ __device__ inline f6_t sat_convert_to_type<f6_t>(float value)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
{
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
}
if(std::abs(value) > NumericLimits<f6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
f6_t res = convert_to_type<f6_t>(value);
if(std::abs(to_float<f6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f6_t>::DataMinSubnorm())
return sign ? NumericUtils<f6_t>::negative_zero_mask
: NumericUtils<f6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to bf6_t with saturation.
*
* If the input is NaN or exceeds the representable range for bf6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated bf6_t value.
*/
template <>
__host__ __device__ inline bf6_t sat_convert_to_type<bf6_t>(float value)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
{
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
}
if(std::abs(value) > NumericLimits<bf6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
bf6_t res = convert_to_type<bf6_t>(value);
if(std::abs(to_float<bf6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<bf6_t>::DataMinSubnorm())
return sign ? NumericUtils<bf6_t>::negative_zero_mask
: NumericUtils<bf6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to f6_t with saturation and stochastic rounding.
*
* If the input is NaN or exceeds the representable range for f6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated f6_t value.
*/
template <>
__host__ __device__ inline f6_t sat_convert_to_type_sr<f6_t>(float value, uint32_t seed)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
if(std::abs(value) > NumericLimits<f6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<f6_t>::data_max_negative_normal_mask
: NumericUtils<f6_t>::data_max_positive_normal_mask;
f6_t res = convert_to_type_sr<f6_t>(value, seed);
if(std::abs(to_float<f6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<f6_t>::DataMinSubnorm())
return sign ? NumericUtils<f6_t>::negative_zero_mask
: NumericUtils<f6_t>::positive_zero_mask;
return res;
}
/**
* @brief Converts a float to f6_t with saturation and stochastic rounding.
*
* If the input is NaN or exceeds the representable range for f6_t, returns
* the corresponding max normal mask. Handles subnormal cases by returning
* zero with the appropriate sign.
*
* @param value The float value to be converted.
* @return The saturated f6_t value.
*/
template <>
__host__ __device__ inline bf6_t sat_convert_to_type_sr<bf6_t>(float value, uint32_t seed)
{
cvt t;
t.value_float = value;
uint32_t sign = t.value_bitwise >> 31;
if(std::isnan(value))
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
if(std::abs(value) > NumericLimits<bf6_t>::Max()) // covers inf case as well
return sign ? NumericUtils<bf6_t>::data_max_negative_normal_mask
: NumericUtils<bf6_t>::data_max_positive_normal_mask;
bf6_t res = convert_to_type_sr<bf6_t>(value, seed);
if(std::abs(to_float<bf6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), res)) <
NumericLimits<bf6_t>::DataMinSubnorm())
return sign ? NumericUtils<bf6_t>::negative_zero_mask
: NumericUtils<bf6_t>::positive_zero_mask;
return res;
}
} // namespace ck::utils
// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/type_convert.hpp"
......@@ -283,4 +286,76 @@ inline __host__ __device__ f4x32_t scaled_type_convert<f4x32_t, float32_t>(e8m0_
#endif
}
/**
* @brief Converts a 6-bit floating-point value (f6_t) to a 32-bit float,
* applying the specified scaling factor.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param x The f6_t value to be converted.
* @return The converted 32-bit float representation of the input.
*/
template <>
inline __host__ __device__ float scaled_type_convert<float, f6_t>(e8m0_bexp_t scale, f6_t x)
{
// currently there is no native conversion instruction
return utils::to_float<f6_t>(scale, x);
}
/**
* @brief Converts a 6-bit floating-point value (bf6_t) to a 32-bit float,
* applying the specified scaling factor.
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param x The bf6_t value to be converted.
* @return The converted 32-bit float representation of the input.
*/
template <>
inline __host__ __device__ float scaled_type_convert<float, bf6_t>(e8m0_bexp_t scale, bf6_t x)
{
// currently there is no native conversion instruction
return utils::to_float<bf6_t>(scale, x);
}
/**
* @brief Converts a 32-bit float to a 6-bit floating-point value (f6_t), applying the specified
* scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (f6_convert_sr) or round-to-nearest-even (f6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for f6_t.
* @param x The float value to convert.
* @return The converted 6-bit floating-point value (f6_t).
*/
template <>
inline __host__ __device__ f6_t scaled_type_convert<f6_t, float>(e8m0_bexp_t scale, float x)
{
#if CK_USE_SR_F6_CONVERSION
return f6_convert_sr(x, type_convert<float>(scale));
#else
return f6_convert_rne(x, type_convert<float>(scale));
#endif
}
/**
* @brief Converts a 32-bit float to a 6-bit floating-point value (bf6_t), applying the specified
* scale.
*
* Depending on whether CK_USE_SR_F6_CONVERSION is defined, it uses either stochastic rounding
* (bf6_convert_sr) or round-to-nearest-even (bf6_convert_rne).
*
* @param scale The exponent scale factor (e8m0_bexp_t) used for bf6_t.
* @param x The float value to convert.
* @return The converted 6-bit floating-point value (bf6_t).
*/
template <>
inline __host__ __device__ bf6_t scaled_type_convert<bf6_t, float>(e8m0_bexp_t scale, float x)
{
#if CK_USE_SR_F6_CONVERSION
return bf6_convert_sr(x, type_convert<float>(scale));
#else
return bf6_convert_rne(x, type_convert<float>(scale));
#endif
}
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/f8_utils.hpp"
#include "ck/utility/mxf4_utils.hpp"
#include "ck/utility/mxf6_utils.hpp"
#include "ck/utility/random_gen.hpp"
#include "ck/utility/array.hpp"
......@@ -1340,6 +1341,147 @@ inline __host__ __device__ float32_t type_convert<float32_t, f4x32_t>(f4x32_t x)
#endif
}
/**
* @brief Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.
*
* Divides the input by the specified scale, then saturates and converts it
* to the 6-bit floating-point format (f6_t).
*
* @param x The input float value.
* @param scale A scaling factor applied to `x` before conversion.
* @return The converted f6_t value.
*/
inline __host__ __device__ f6_t f6_convert_rne(float x, float scale = 1.0f)
{
// currently there is no native conversion instruction
return utils::sat_convert_to_type<f6_t>(x / scale);
}
/**
* @brief Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.
*
* Divides the input by the specified scale, then performs saturation and conversion
* to f6_t based on a pseudo-randomly generated seed.
*
* @param x The input float value.
* @param scale A scaling factor applied to `x` before conversion.
* @return The converted f6_t value.
*/
inline __host__ __device__ f6_t f6_convert_sr(float x, float scale = 1.0f)
{
constexpr int seed = 1254739;
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
// currently there is no native conversion instruction
return utils::sat_convert_to_type_sr<f6_t>(x / scale, rng);
}
/**
* @brief Specializes the type conversion template for converting a float into the 6-bit float type
* (f6_t).
*
* Depending on the CK_USE_SR_F4_CONVERSION flag,
* the conversion uses stochastic rounding
* or round-to-nearest-even.
*
* @param x Input float value to be converted.
* @return The converted f6_t value.
*/
template <>
inline __host__ __device__ f6_t type_convert<f6_t, float>(float x)
{
#if CK_USE_SR_F4_CONVERSION
return f6_convert_sr(x);
#else
return f6_convert_rne(x);
#endif
}
/**
* @brief Specializes the type conversion template for converting the 6-bit float type (f6_t) to
* float.
*
* Interprets an f6_t value as a float using the default scale factor of 1.
*
* @param x The 6-bit float (f6_t) value to be converted.
* @return The corresponding float representation.
*/
template <>
inline __host__ __device__ float type_convert<float, f6_t>(f6_t x)
{
// currently there is no native conversion instruction
return utils::to_float<f6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), x);
}
/**
* @brief Converts a float to the 6-bit BF6 type using round-to-nearest-even.
*
* Divides the input by the specified scale, then saturates and converts
* it to a 6-bit BF6 floating-point format.
*
* @param x The float value to be converted.
* @param scale The scaling factor applied to the input before conversion.
* @return The converted bf6_t value.
*/
inline __host__ __device__ bf6_t bf6_convert_rne(float x, float scale = 1.0f)
{
// currently there is no native conversion instruction
return utils::sat_convert_to_type<bf6_t>(x / scale);
}
/**
* @brief Converts a float to the 6-bit BF6 type using stochastic rounding.
*
* Divides the input by the specified scale,
* and converts the result to a 6-bit BF6 floating-point
* format with stochastic rounding.
*
* @param x The float value to be converted.
* @param scale The scaling factor applied to the input before conversion.
* @return The converted bf6_t value.
*/
inline __host__ __device__ bf6_t bf6_convert_sr(float x, float scale = 1.0f)
{
constexpr int seed = 1254739;
uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
// currently there is no native conversion instruction
return utils::sat_convert_to_type_sr<bf6_t>(x / scale, rng);
}
/**
* @brief Specializes float-to-bf6_t conversion.
*
* Uses stochastic rounding if CK_USE_SR_F4_CONVERSION is defined,
* otherwise uses round-to-nearest-even.
*
* @param x Input float value to convert.
* @return Converted bf6_t value.
*/
template <>
inline __host__ __device__ bf6_t type_convert<bf6_t, float>(float x)
{
#if CK_USE_SR_F4_CONVERSION
return bf6_convert_sr(x);
#else
return bf6_convert_rne(x);
#endif
}
/**
* @brief Specializes the type conversion template for converting a bf6_t value to float.
*
* Interprets the bf6_t value using the default scale factor of 1 and returns
* its floating-point representation.
*
* @param x The bf6_t value to convert.
* @return The float representation of the given bf6_t value.
*/
template <>
inline __host__ __device__ float type_convert<float, bf6_t>(bf6_t x)
{
// currently there is no native conversion instruction
return utils::to_float<bf6_t>(NumericLimits<e8m0_bexp_t>::Binary_1(), x);
}
template <typename Y, typename X, std::size_t NumElems>
inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
const std::array<X, NumElems>& x)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/utility/scaled_type_convert.hpp"
using ck::bf6_convert_rne;
using ck::bf6_convert_sr;
using ck::bf6_t;
using ck::e8m0_bexp_t;
using ck::Number;
......@@ -21,3 +23,196 @@ TEST(BF6, NumericLimits)
EXPECT_EQ(ck::NumericLimits<bf6_t>::MinSubnorm(), bf6_t(0b000001));
EXPECT_EQ(ck::NumericLimits<bf6_t>::MaxSubnorm(), bf6_t(0b000011));
}
TEST(BF6, ConvertFP32Nearest)
{
// set maximum bf6 value
float max_bf6 = 28.0f;
// convert 0 float to bf6 and back, check if holds
ASSERT_NEAR(0.0f, type_convert<float>(bf6_convert_rne(0.0f)), 0.0f);
// convert max_bf6 to float and check if equal to max_bf6
ASSERT_NEAR(max_bf6, type_convert<float>(bf6_convert_rne(max_bf6)), 0.0f);
// convert maximal float to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(
max_bf6, type_convert<float>(bf6_convert_rne(std::numeric_limits<float>::max())), 0.0f);
// convert float Inf to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(max_bf6,
type_convert<float>(bf6_convert_rne(std::numeric_limits<float>::infinity())),
0.0f);
// convert float value less than bf6 subnorm to bf6 and back, check if equal to 0.0
float less_than_subnorm = 0.0625f;
ASSERT_NEAR(0.0f, type_convert<float>(f6_convert_rne(less_than_subnorm)), 0.0f);
// convert float NaN to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(max_bf6,
type_convert<float>(f6_convert_rne(std::numeric_limits<float>::quiet_NaN())),
0.0f);
// positive norm float value to bf6 and back, check if holds
float pos_float = 0.25f;
ASSERT_NEAR(pos_float, type_convert<float>(bf6_convert_rne(pos_float)), 0.0f);
// negative norm float value to bf6 and back, check if holds
float neg_float = -0.5f;
ASSERT_NEAR(neg_float, type_convert<float>(bf6_convert_rne(neg_float)), 0.0f);
// positive subnorm float value to bf6 and back, check if holds
pos_float = 0.1875f;
ASSERT_NEAR(pos_float, type_convert<float>(bf6_convert_rne(pos_float)), 0.0f);
// negative subnorm float value to bf6 and back, check if holds
neg_float = -0.0625f;
ASSERT_NEAR(neg_float, type_convert<float>(bf6_convert_rne(neg_float)), 0.0f);
}
TEST(BF6, ConvertFP32Stochastic)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum bf6 value
float max_bf6 = 28.0f;
// convert 0 float to bf6 and back, check if holds
ASSERT_NEAR(0.0f, type_convert<float>(bf6_convert_sr(0.0f)), abs_tol);
// convert maximal bf6_t to float and check if equal to max_bf6
ASSERT_NEAR(max_bf6, type_convert<float>(bf6_convert_sr(max_bf6)), abs_tol);
// convert maximal float to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(
max_bf6, type_convert<float>(bf6_convert_sr(std::numeric_limits<float>::max())), abs_tol);
// convert float Inf to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(max_bf6,
type_convert<float>(bf6_convert_rne(std::numeric_limits<float>::infinity())),
0.0f);
// convert float NaN to bf6 and back, check if clipped to max_bf6
ASSERT_NEAR(max_bf6,
type_convert<float>(f6_convert_rne(std::numeric_limits<float>::quiet_NaN())),
0.0f);
// positive norm float value to bf6 and back, check if holds
float pos_float = 0.25f;
ASSERT_NEAR(pos_float, type_convert<float>(bf6_convert_sr(pos_float)), abs_tol);
// negative norm float value to bf6 and back, check if holds
float neg_float = -0.5f;
ASSERT_NEAR(neg_float, type_convert<float>(bf6_convert_sr(neg_float)), abs_tol);
// positive subnorm float value to bf6 and back, check if holds
pos_float = 0.1875f;
ASSERT_NEAR(pos_float, type_convert<float>(bf6_convert_sr(pos_float)), abs_tol);
// negative subnorm float value to bf6 and back, check if holds
neg_float = -0.0625f;
ASSERT_NEAR(neg_float, type_convert<float>(bf6_convert_sr(neg_float)), abs_tol);
}
TEST(BF6, ScaledConvertFP32Nearest)
{
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
float min_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Min()); // 0x00 -> float
// set arbitrary scale to 256.0
float test_scale = 256.0f; // 0b10000111
// convert 0 float to bf6 and back with maximal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_rne(0.0f)), 0.0f);
// convert 0 float to bf6 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_rne(0.0f)), 0.0f);
// positive norm float value to bf6 and back with various scales, check if holds
float pos_float = 0.25f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_rne(pos_float)),
0.0f);
// negative norm float value to bf6 and back with various scales, check if holds
float neg_float = -0.5f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_rne(neg_float)),
0.0f);
// positive subnorm float value to bf6 and back with various scales, check if holds
pos_float = 0.1875f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_rne(pos_float)),
0.0f);
// negative subnorm float value to bf6 and back with various scales, check if holds
neg_float = -0.0625f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_rne(neg_float)),
0.0f);
}
TEST(BF6, ScaledConvertFP32Stochastic)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
float min_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Min()); // 0x00 -> float
// set arbitrary scale to 256.0
float test_scale = 256.0f; // 0b10000111
// convert 0 float to bf6 and back with maximal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_sr(0.0f)), abs_tol);
// convert 0 float to bf6 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_sr(0.0f)), abs_tol);
// positive norm float value to bf6 and back with various scales, check if holds
float pos_float = 0.25f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_sr(pos_float)),
abs_tol);
// negative norm float value to bf6 and back with various scales, check if holds
float neg_float = -0.5f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_sr(neg_float)),
abs_tol);
// positive subnorm float value to bf6 and back with various scales, check if holds
pos_float = 0.1875f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_sr(pos_float)),
abs_tol);
// negative subnorm float value to bf6 and back with various scales, check if holds
neg_float = -0.0625f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), bf6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), bf6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), bf6_convert_sr(neg_float)),
abs_tol);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
......@@ -83,8 +83,6 @@ TEST(FP4, ScaledConvertFP32Nearest)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum fp4 value
float max_fp4 = 6.0f;
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
......@@ -97,10 +95,6 @@ TEST(FP4, ScaledConvertFP32Nearest)
// convert 0 float to fp4 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), f4_convert_rne(0.0f)), abs_tol);
// convert maximal f4_t with minimal scale to float and check if equal to minimal float
ASSERT_NEAR(ck::NumericLimits<float>::Min(),
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f4_convert_rne(max_fp4)),
abs_tol);
// positive norm float value to fp4 and back with various scales, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float * test_scale,
......@@ -151,8 +145,6 @@ TEST(FP4, ScaledConvertFP32Stochastic)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum fp4 value
float max_fp4 = 6.0f;
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
......@@ -165,10 +157,6 @@ TEST(FP4, ScaledConvertFP32Stochastic)
// convert 0 float to fp4 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), f4_convert_sr(0.0f)), abs_tol);
// convert maximal f4_t with minimal scale to float and check if equal to minimal float
ASSERT_NEAR(ck::NumericLimits<float>::Min(),
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f4_convert_sr(max_fp4)),
abs_tol);
// positive norm float value to fp4 and back with various scales, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float * test_scale,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
......@@ -7,6 +7,8 @@
#include "ck/utility/scaled_type_convert.hpp"
using ck::e8m0_bexp_t;
using ck::f6_convert_rne;
using ck::f6_convert_sr;
using ck::f6_t;
using ck::Number;
using ck::scaled_type_convert;
......@@ -21,3 +23,195 @@ TEST(FP6, NumericLimits)
EXPECT_EQ(ck::NumericLimits<f6_t>::MinSubnorm(), f6_t(0b000001));
EXPECT_EQ(ck::NumericLimits<f6_t>::MaxSubnorm(), f6_t(0b000111));
}
TEST(FP6, ConvertFP32Nearest)
{
// set maximum fp6 value
float max_fp6 = 7.5f;
// convert 0 float to fp6 and back, check if holds
ASSERT_NEAR(0.0f, type_convert<float>(f6_convert_rne(0.0f)), 0.0f);
// convert maximal f6_t to float and check if equal to max_fp6
ASSERT_NEAR(max_fp6, type_convert<float>(f6_convert_rne(max_fp6)), 0.0f);
// convert maximal float to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(
max_fp6, type_convert<float>(f6_convert_rne(std::numeric_limits<float>::max())), 0.0f);
// convert float Inf to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(
max_fp6, type_convert<float>(f6_convert_rne(std::numeric_limits<float>::infinity())), 0.0f);
// convert float value less than fp6 subnorm to fp6 and back, check if equal to 0.0
float less_than_subnorm = 0.0625f;
ASSERT_NEAR(0.0f, type_convert<float>(f6_convert_rne(less_than_subnorm)), 0.0f);
// convert float NaN to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(max_fp6,
type_convert<float>(f6_convert_rne(std::numeric_limits<float>::quiet_NaN())),
0.0f);
// positive norm float value to fp6 and back, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float, type_convert<float>(f6_convert_rne(pos_float)), 0.0f);
// negative norm float value to fp6 and back, check if holds
float neg_float = -1.5f;
ASSERT_NEAR(neg_float, type_convert<float>(f6_convert_rne(neg_float)), 0.0f);
// positive subnorm float value to fp6 and back, check if holds
pos_float = 0.125f;
ASSERT_NEAR(pos_float, type_convert<float>(f6_convert_rne(pos_float)), 0.0f);
// negative subnorm float value to fp6 and back, check if holds
neg_float = -0.25f;
ASSERT_NEAR(neg_float, type_convert<float>(f6_convert_rne(neg_float)), 0.0f);
}
TEST(FP6, ConvertFP32Stochastic)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum fp6 value
float max_fp6 = 7.5f;
// convert 0 float to fp6 and back, check if holds
ASSERT_NEAR(0.0f, type_convert<float>(f6_convert_sr(0.0f)), abs_tol);
// convert maximal f6_t to float and check if equal to max_fp6
ASSERT_NEAR(max_fp6, type_convert<float>(f6_convert_sr(max_fp6)), abs_tol);
// convert maximal float to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(
max_fp6, type_convert<float>(f6_convert_sr(std::numeric_limits<float>::max())), abs_tol);
// convert float Inf to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(max_fp6,
type_convert<float>(f6_convert_sr(std::numeric_limits<float>::infinity())),
abs_tol);
// convert float NaN to fp6 and back, check if clipped to max_fp6
ASSERT_NEAR(max_fp6,
type_convert<float>(f6_convert_sr(std::numeric_limits<float>::quiet_NaN())),
abs_tol);
// positive norm float value to fp6 and back, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float, type_convert<float>(f6_convert_sr(pos_float)), abs_tol);
// negative norm float value to fp6 and back, check if holds
float neg_float = -1.5f;
ASSERT_NEAR(neg_float, type_convert<float>(f6_convert_sr(neg_float)), abs_tol);
// positive subnorm float value to fp6 and back, check if holds
pos_float = 0.125f;
ASSERT_NEAR(pos_float, type_convert<float>(f6_convert_sr(pos_float)), abs_tol);
// negative subnorm float value to fp6 and back, check if holds
neg_float = -0.25f;
ASSERT_NEAR(neg_float, type_convert<float>(f6_convert_sr(neg_float)), abs_tol);
}
TEST(FP6, ScaledConvertFP32Nearest)
{
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
float min_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Min()); // 0x00 -> float
// set arbitrary scale to 256.0
float test_scale = 256.0f; // 0b10000111
// convert 0 float to fp6 and back with maximal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_rne(0.0f)), 0.0f);
// convert 0 float to fp6 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_rne(0.0f)), 0.0f);
// positive norm float value to fp6 and back with various scales, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_rne(pos_float)),
0.0f);
// negative norm float value to fp6 and back with various scales, check if holds
float neg_float = -1.5f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_rne(neg_float)),
0.0f);
// positive subnorm float value to fp6 and back with various scales, check if holds
pos_float = 0.125f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_rne(pos_float)),
0.0f);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_rne(pos_float)),
0.0f);
// negative subnorm float value to fp6 and back with various scales, check if holds
neg_float = -0.25f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_rne(neg_float)),
0.0f);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_rne(neg_float)),
0.0f);
}
TEST(FP6, ScaledConvertFP32Stochastic)
{
// fix the tolerance value
float abs_tol = 1e-6;
// set maximum scale
float max_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Max()); // 0xFE -> float
// set minimum scale
float min_scale = type_convert<float>(ck::NumericLimits<e8m0_bexp_t>::Min()); // 0x00 -> float
// set arbitrary scale to 256.0
float test_scale = 256.0f; // 0b10000111
// convert 0 float to fp6 and back with maximal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_sr(0.0f)), abs_tol);
// convert 0 float to fp6 and back with minimal scale, check if holds
ASSERT_NEAR(
0.0f, scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_sr(0.0f)), abs_tol);
// positive norm float value to fp6 and back with various scales, check if holds
float pos_float = 1.0f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_sr(pos_float)),
abs_tol);
// negative norm float value to fp6 and back with various scales, check if holds
float neg_float = -1.5f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_sr(neg_float)),
abs_tol);
// positive subnorm float value to fp6 and back with various scales, check if holds
pos_float = 0.125f;
ASSERT_NEAR(pos_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_sr(pos_float)),
abs_tol);
ASSERT_NEAR(pos_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_sr(pos_float)),
abs_tol);
// negative subnorm float value to fp6 and back with various scales, check if holds
neg_float = -0.25f;
ASSERT_NEAR(neg_float * test_scale,
scaled_type_convert<float>(e8m0_bexp_t(test_scale), f6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * max_scale,
scaled_type_convert<float>(e8m0_bexp_t(max_scale), f6_convert_sr(neg_float)),
abs_tol);
ASSERT_NEAR(neg_float * min_scale,
scaled_type_convert<float>(e8m0_bexp_t(min_scale), f6_convert_sr(neg_float)),
abs_tol);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment