Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
ab9d7598
Commit
ab9d7598
authored
May 06, 2025
by
yuguo
Browse files
Merge branch 'main' of
http://10.6.10.68/dcutoolkit/deeplearing/TransformerEngine
parents
229be5e8
6efebcd0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
9 deletions
+9
-9
transformer_engine/common/gemm/rocm_gemm.cu
transformer_engine/common/gemm/rocm_gemm.cu
+9
-9
No files found.
transformer_engine/common/gemm/rocm_gemm.cu
View file @
ab9d7598
...
...
@@ -115,7 +115,7 @@ float gelu_forward(float x)
template
<
typename
T
,
int
THREADS_PER_BLOCK
>
__global__
void
gelu_forward_kernel
(
const
float
*
in
,
T
*
out
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
)
{
void
__launch_bounds__
(
THREADS_PER_BLOCK
)
gelu_forward_kernel
(
const
float
*
in
,
T
*
out
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
)
{
// fp8 output flow
if
constexpr
(
std
::
is_same
<
T
,
fp8e4m3
>::
value
||
std
::
is_same
<
T
,
fp8e5m2
>::
value
){
typedef
hipcub
::
BlockReduce
<
float
,
THREADS_PER_BLOCK
>
BlockReduce
;
...
...
@@ -184,15 +184,15 @@ void gelu_backward_kernel(const float* dy, T* out, const Taux* __restrict pre_ge
// Launches gelu_backward_kernel over an m x n gradient tensor.
//
// in:            incoming gradient values (float; the kernel's hunk header names
//                this "dy" — presumably the upstream gradient; confirm at call site)
// out:           output gradient buffer, element type T
// pre_gelu_out:  values saved by the forward pass (type Taux), read-only
// m, n:          logical matrix dimensions (rows, columns)
// stream:        HIP stream the kernel is enqueued on
//
// NOTE(review): this commit reduced the block size from 1024 to 256 threads;
// the grid is capped at 65536 blocks, so for very large m*n the kernel is
// presumably expected to loop over remaining elements — confirm against the
// kernel body (not visible here).
template<typename T, typename Taux>
void gelu_backward_kernelLauncher(const float* in, T* out,
                                  const Taux* pre_gelu_out,
                                  int m, int n, hipStream_t stream) {
    // Each row may need several blocks when n exceeds the 256-thread block.
    int blocks_per_row = ceil(float(n) / 256);
    dim3 grid(min(m * blocks_per_row, 65536));
    dim3 block(min(n, 256));
    hipLaunchKernelGGL((gelu_backward_kernel<T, Taux>), dim3(grid), dim3(block),
                       0, stream, in, out, pre_gelu_out, m, n);
}
template
<
typename
T
,
typename
Tb
,
int
THREADS_PER_BLOCK
>
__global__
void
add_bias_kernel
(
const
float
*
in
,
T
*
out
,
const
Tb
*
__restrict
bias
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
){
void
__launch_bounds__
(
THREADS_PER_BLOCK
)
add_bias_kernel
(
const
float
*
in
,
T
*
out
,
const
Tb
*
__restrict
bias
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
){
// fp8 output flow
if
constexpr
(
std
::
is_same
<
T
,
fp8e4m3
>::
value
||
std
::
is_same
<
T
,
fp8e5m2
>::
value
){
typedef
hipcub
::
BlockReduce
<
float
,
THREADS_PER_BLOCK
>
BlockReduce
;
...
...
@@ -232,7 +232,7 @@ void add_bias_kernelLauncher(const float* in, T* out, const Tb* __restrict bias,
template
<
typename
T
,
typename
Taux
,
typename
Tb
,
int
THREADS_PER_BLOCK
>
__global__
void
add_bias_gelu_kernel
(
const
float
*
in
,
T
*
out
,
Taux
*
pre_gelu_out
,
const
Tb
*
__restrict
bias
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
){
void
__launch_bounds__
(
THREADS_PER_BLOCK
)
add_bias_gelu_kernel
(
const
float
*
in
,
T
*
out
,
Taux
*
pre_gelu_out
,
const
Tb
*
__restrict
bias
,
float
*
amax
,
const
float
*
scale
,
int
m
,
int
n
){
// fp8 output flow
if
constexpr
(
std
::
is_same
<
T
,
fp8e4m3
>::
value
||
std
::
is_same
<
T
,
fp8e5m2
>::
value
){
// only need to deal with amax and scale of D, no need to deal with amax and scale of pre_gelu_out
...
...
@@ -288,14 +288,14 @@ void identity_kernel(const Tin* in, T* out, int n) {
// Launches identity_kernel to copy/convert n elements from type Tin to type T.
//
// in:     source buffer of n elements (type Tin)
// out:    destination buffer of n elements (type T)
// n:      element count
// stream: HIP stream the kernel is enqueued on
//
// NOTE(review): this commit reduced the block size from 1024 to 256 threads;
// the grid fully covers n (ceil division), one element per thread presumably —
// confirm against the kernel body (not visible here).
template<typename Tin, typename T>
void identity_kernelLauncher(const Tin* in, T* out, int n, hipStream_t stream) {
    dim3 block, grid;
    block.x = 256;            // threads per block
    grid.x = ceil(n / 256.);  // enough blocks to cover all n elements
    hipLaunchKernelGGL((identity_kernel<Tin, T>), dim3(grid), dim3(block),
                       0, stream, in, out, n);
}
template
<
typename
T
,
int
THREADS_PER_BLOCK
>
__global__
void
identity_output_kernel
(
const
float
*
in
,
T
*
out
,
float
*
amax
,
const
float
*
scale
,
int
n
)
{
void
__launch_bounds__
(
THREADS_PER_BLOCK
)
identity_output_kernel
(
const
float
*
in
,
T
*
out
,
float
*
amax
,
const
float
*
scale
,
int
n
)
{
if
constexpr
(
std
::
is_same
<
T
,
fp8e4m3
>::
value
||
std
::
is_same
<
T
,
fp8e5m2
>::
value
){
typedef
hipcub
::
BlockReduce
<
float
,
THREADS_PER_BLOCK
>
BlockReduce
;
__shared__
typename
BlockReduce
::
TempStorage
block_temp_storage
;
...
...
@@ -331,7 +331,7 @@ void identity_output_kernelLauncher(const float* in, T* out, float* amax, const
template
<
typename
Tin
,
int
THREADS_PER_BLOCK
>
__global__
void
bias_gradient_kernel
(
const
Tin
*
in
,
float
*
out
,
int
m
,
int
n
)
{
void
__launch_bounds__
(
THREADS_PER_BLOCK
)
bias_gradient_kernel
(
const
Tin
*
in
,
float
*
out
,
int
m
,
int
n
)
{
typedef
hipcub
::
BlockReduce
<
float
,
THREADS_PER_BLOCK
>
BlockReduce
;
__shared__
typename
BlockReduce
::
TempStorage
block_temp_storage
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment