Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
1e55b6f6
"tests/pipelines/test_pipelines.py" did not exist on "a7ca03aa85f94574f06576d2155b3ec061fe8d63"
Commit
1e55b6f6
authored
Nov 21, 2024
by
Rostyslav Geyyer
Browse files
Fix even more gfx950 conversions
parent
33f4f75b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
88 additions
and
21 deletions
+88
-21
include/ck/utility/type_convert.hpp
include/ck/utility/type_convert.hpp
+88
-21
No files found.
include/ck/utility/type_convert.hpp
View file @
1e55b6f6
...
@@ -991,8 +991,14 @@ template <>
...
@@ -991,8 +991,14 @@ template <>
inline
__host__
__device__
float
type_convert
<
float
,
f4_t
>
(
f4_t
x
)
inline
__host__
__device__
float
type_convert
<
float
,
f4_t
>
(
f4_t
x
)
{
{
#if defined(__gfx950__)
#if defined(__gfx950__)
float
scale
=
1.0
f
;
union
return
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
x
,
scale
,
0
)
{
float
float_array
[
2
];
float2_t
float2_array
;
}
float_values
{};
float
scale
=
1.0
f
;
float_values
.
float2_array
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
x
,
scale
,
0
);
return
float_values
.
float_array
[
0
];
#else
#else
return
utils
::
to_float
<
f4_t
>
(
NumericLimits
<
e8m0_scale_t
>::
Binary_1
(),
x
);
return
utils
::
to_float
<
f4_t
>
(
NumericLimits
<
e8m0_scale_t
>::
Binary_1
(),
x
);
#endif
#endif
...
@@ -1240,8 +1246,14 @@ template <>
...
@@ -1240,8 +1246,14 @@ template <>
inline
__host__
__device__
float
scaled_type_convert
<
float
,
f4_t
>
(
e8m0_scale_t
scale
,
f4_t
x
)
inline
__host__
__device__
float
scaled_type_convert
<
float
,
f4_t
>
(
e8m0_scale_t
scale
,
f4_t
x
)
{
{
#if defined(__gfx950__)
#if defined(__gfx950__)
return
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
x
,
type_convert
<
float
>
(
scale
),
0
)
union
.
template
AsType
<
float
>()(
Number
<
0
>
{});
{
float
float_array
[
2
];
float2_t
float2_array
;
}
float_values
{};
float_values
.
float2_array
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
x
,
type_convert
<
float
>
(
scale
),
0
);
return
float_values
.
float_array
[
0
];
#else
#else
return
utils
::
to_float
<
f4_t
>
(
scale
,
x
);
return
utils
::
to_float
<
f4_t
>
(
scale
,
x
);
#endif
#endif
...
@@ -1253,7 +1265,13 @@ inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_s
...
@@ -1253,7 +1265,13 @@ inline __host__ __device__ float2_t scaled_type_convert<float2_t, f4x2_t>(e8m0_s
f4x2_t
x
)
f4x2_t
x
)
{
{
#if defined(__gfx950__)
#if defined(__gfx950__)
return
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
x
,
type_convert
<
float
>
(
scale
),
0
);
union
{
uint32_t
bitwise
;
f4x2_t
f4x2_array
[
4
];
}
value
{};
value
.
f4x2_array
[
0
]
=
x
;
return
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
#else
#else
float2_t
ret
{
utils
::
to_float
<
f4_t
>
(
scale
,
x
.
unpack
(
1
)),
float2_t
ret
{
utils
::
to_float
<
f4_t
>
(
scale
,
x
.
unpack
(
1
)),
utils
::
to_float
<
f4_t
>
(
scale
,
x
.
unpack
(
0
))};
utils
::
to_float
<
f4_t
>
(
scale
,
x
.
unpack
(
0
))};
...
@@ -1272,58 +1290,107 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m
...
@@ -1272,58 +1290,107 @@ inline __host__ __device__ float32_t scaled_type_convert<float32_t, f4x32_t>(e8m
f4x32_t
f4x32_array
;
f4x32_t
f4x32_array
;
f4x2_t
fp4x2
[
16
];
f4x2_t
fp4x2
[
16
];
}
value
{
x
};
}
value
{
x
};
union
{
uint32_t
bitwise
;
f4x2_t
f4x2_array
[
4
];
}
bitwise_value
{};
float2_t
op
;
float2_t
op
;
float32_t
ret
;
float32_t
ret
;
// TODO: pack in a loop
// TODO: pack in a loop
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
0
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
0
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
0
]
=
op
[
0
];
ret
[
0
]
=
op
[
0
];
ret
[
1
]
=
op
[
1
];
ret
[
1
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
1
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
2
]
=
op
[
0
];
ret
[
2
]
=
op
[
0
];
ret
[
3
]
=
op
[
1
];
ret
[
3
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
2
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
2
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
4
]
=
op
[
0
];
ret
[
4
]
=
op
[
0
];
ret
[
5
]
=
op
[
1
];
ret
[
5
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
3
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
3
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
6
]
=
op
[
0
];
ret
[
6
]
=
op
[
0
];
ret
[
7
]
=
op
[
1
];
ret
[
7
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
4
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
4
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
8
]
=
op
[
0
];
ret
[
8
]
=
op
[
0
];
ret
[
9
]
=
op
[
1
];
ret
[
9
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
5
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
5
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
10
]
=
op
[
0
];
ret
[
10
]
=
op
[
0
];
ret
[
11
]
=
op
[
1
];
ret
[
11
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
6
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
6
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
12
]
=
op
[
0
];
ret
[
12
]
=
op
[
0
];
ret
[
13
]
=
op
[
1
];
ret
[
13
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
7
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
7
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
14
]
=
op
[
0
];
ret
[
14
]
=
op
[
0
];
ret
[
15
]
=
op
[
1
];
ret
[
15
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
8
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
8
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
16
]
=
op
[
0
];
ret
[
16
]
=
op
[
0
];
ret
[
17
]
=
op
[
1
];
ret
[
17
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
9
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
9
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
18
]
=
op
[
0
];
ret
[
18
]
=
op
[
0
];
ret
[
19
]
=
op
[
1
];
ret
[
19
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
10
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
10
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
20
]
=
op
[
0
];
ret
[
20
]
=
op
[
0
];
ret
[
21
]
=
op
[
1
];
ret
[
21
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
11
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
11
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
22
]
=
op
[
0
];
ret
[
22
]
=
op
[
0
];
ret
[
23
]
=
op
[
1
];
ret
[
23
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
12
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
12
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
24
]
=
op
[
0
];
ret
[
24
]
=
op
[
0
];
ret
[
25
]
=
op
[
1
];
ret
[
25
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
13
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
13
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
26
]
=
op
[
0
];
ret
[
26
]
=
op
[
0
];
ret
[
27
]
=
op
[
1
];
ret
[
27
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
14
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
14
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
28
]
=
op
[
0
];
ret
[
28
]
=
op
[
0
];
ret
[
29
]
=
op
[
1
];
ret
[
29
]
=
op
[
1
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
value
.
fp4x2
[
15
],
type_convert
<
float
>
(
scale
),
0
);
bitwise_value
.
f4x2_array
[
0
]
=
value
.
fp4x2
[
15
];
op
=
__builtin_amdgcn_cvt_scalef32_pk_f32_fp4
(
bitwise_value
.
bitwise
,
type_convert
<
float
>
(
scale
),
0
);
ret
[
30
]
=
op
[
0
];
ret
[
30
]
=
op
[
0
];
ret
[
31
]
=
op
[
1
];
ret
[
31
]
=
op
[
1
];
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment