"tests/pipelines/test_pipelines.py" did not exist on "a7ca03aa85f94574f06576d2155b3ec061fe8d63"
Commit 1e55b6f6 authored by Rostyslav Geyyer's avatar Rostyslav Geyyer
Browse files

Fix even more gfx950 conversions

parent 33f4f75b
...@@ -991,8 +991,14 @@ template <> ...@@ -991,8 +991,14 @@ template <>
inline __host__ __device__ float type_convert<float, f4_t>(f4_t x) inline __host__ __device__ float type_convert<float, f4_t>(f4_t x)
{ {
#if defined(__gfx950__) #if defined(__gfx950__)
float scale = 1.0f; union
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, scale, 0) {
float float_array[2];
float2_t float2_array;
} float_values{};
float scale = 1.0f;
float_values.float2_array = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, scale, 0);
return float_values.float_array[0];
#else #else
return utils::to_float<f4_t>(NumericLimits<e8m0_scale_t>::Binary_1(), x); return utils::to_float<f4_t>(NumericLimits<e8m0_scale_t>::Binary_1(), x);
#endif #endif
...@@ -1240,8 +1246,14 @@ template <> ...@@ -1240,8 +1246,14 @@ template <>
inline __host__ __device__ float scaled_type_convert<float, f4_t>(e8m0_scale_t scale, f4_t x) inline __host__ __device__ float scaled_type_convert<float, f4_t>(e8m0_scale_t scale, f4_t x)
{ {
#if defined(__gfx950__) #if defined(__gfx950__)
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, type_convert<float>(scale), 0) union
.template AsType<float>()(Number<0>{}); {
float float_array[2];
float2_t float2_array;
} float_values{};
float_values.float2_array =
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, type_convert<float>(scale), 0);
return float_values.float_array[0];
#else #else
return utils::to_float<f4_t>(scale, x); return utils::to_float<f4_t>(scale, x);
#endif #endif
...@@ -1253,7 +1265,13 @@ inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_s ...@@ -1253,7 +1265,13 @@ inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_s
f4x2_t x) f4x2_t x)
{ {
#if defined(__gfx950__) #if defined(__gfx950__)
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(x, type_convert<float>(scale), 0); union
{
uint32_t bitwise;
f4x2_t f4x2_array[4];
} value{};
value.f4x2_array[0] = x;
return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, type_convert<float>(scale), 0);
#else #else
float2_t ret{utils::to_float<f4_t>(scale, x.unpack(1)), float2_t ret{utils::to_float<f4_t>(scale, x.unpack(1)),
utils::to_float<f4_t>(scale, x.unpack(0))}; utils::to_float<f4_t>(scale, x.unpack(0))};
...@@ -1272,58 +1290,107 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m ...@@ -1272,58 +1290,107 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m
f4x32_t f4x32_array; f4x32_t f4x32_array;
f4x2_t fp4x2[16]; f4x2_t fp4x2[16];
} value{x}; } value{x};
union
{
uint32_t bitwise;
f4x2_t f4x2_array[4];
} bitwise_value{};
float2_t op; float2_t op;
float32_t ret; float32_t ret;
// TODO: pack in a loop // TODO: pack in a loop
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[0], type_convert<float>(scale), 0); bitwise_value.f4x2_array[0] = value.fp4x2[0];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[0] = op[0]; ret[0] = op[0];
ret[1] = op[1]; ret[1] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[1], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[2] = op[0]; ret[2] = op[0];
ret[3] = op[1]; ret[3] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[2], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[2];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[4] = op[0]; ret[4] = op[0];
ret[5] = op[1]; ret[5] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[3], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[3];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[6] = op[0]; ret[6] = op[0];
ret[7] = op[1]; ret[7] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[4], type_convert<float>(scale), 0); bitwise_value.f4x2_array[0] = value.fp4x2[4];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[8] = op[0]; ret[8] = op[0];
ret[9] = op[1]; ret[9] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[5], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[5];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[10] = op[0]; ret[10] = op[0];
ret[11] = op[1]; ret[11] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[6], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[6];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[12] = op[0]; ret[12] = op[0];
ret[13] = op[1]; ret[13] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[7], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[7];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[14] = op[0]; ret[14] = op[0];
ret[15] = op[1]; ret[15] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[8], type_convert<float>(scale), 0); bitwise_value.f4x2_array[0] = value.fp4x2[8];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[16] = op[0]; ret[16] = op[0];
ret[17] = op[1]; ret[17] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[9], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[9];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[18] = op[0]; ret[18] = op[0];
ret[19] = op[1]; ret[19] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[10], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[10];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[20] = op[0]; ret[20] = op[0];
ret[21] = op[1]; ret[21] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[11], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[11];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[22] = op[0]; ret[22] = op[0];
ret[23] = op[1]; ret[23] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[12], type_convert<float>(scale), 0); bitwise_value.f4x2_array[0] = value.fp4x2[12];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[24] = op[0]; ret[24] = op[0];
ret[25] = op[1]; ret[25] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[13], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[13];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[26] = op[0]; ret[26] = op[0];
ret[27] = op[1]; ret[27] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[14], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[14];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[28] = op[0]; ret[28] = op[0];
ret[29] = op[1]; ret[29] = op[1];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[15], type_convert<float>(scale), 0);
bitwise_value.f4x2_array[0] = value.fp4x2[15];
op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(
bitwise_value.bitwise, type_convert<float>(scale), 0);
ret[30] = op[0]; ret[30] = op[0];
ret[31] = op[1]; ret[31] = op[1];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment