gaoqiong / composable_kernel_ROCM · Commits

Commit 28f93c95, authored Feb 06, 2025 by Jiming Ruan
parent b426b99a

    add layernorm

Showing 3 changed files with 45 additions and 26 deletions (+45 −26)
example/ck_tile/02_layernorm2d/generate.py  +2 −2
example/ck_tile/10_rmsnorm2d/generate.py  +1 −1
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp  +42 −23
example/ck_tile/02_layernorm2d/generate.py

@@ -564,9 +564,9 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
 ...
              h_traits('x', 'y', 'xs', 'ys', 1,  4, 1,  512, 4, True, False, True, True, False, 0, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1,  4, 1, 1024, 2, True, False, True, True, False, 0, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1,  8, 1, 1024, 1, True, False, True, True, False, 0, 0, 0)],
-    'big' : [h_traits('x', 'y', 'xs', 'ys', 1,  2, 1,  256, 8, True, False, True, True, True, 0, 0, 0),
+    'big' : [h_traits('x', 'y', 'xs', 'ys', 1,  1, 1, 1024, 8, True, False, True, True, True, 0, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1,  4, 1,  256, 4, True, False, True, True, True, 0, 0, 0),
-             h_traits('x', 'y', 'xs', 'ys', 1,  2, 1, 1024, 2, True, False, True, True, True, 0, 0, 0),
+             h_traits('x', 'y', 'xs', 'ys', 1, 12, 1,  256, 2, True, False, True, True, True, 0, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1,  4, 1, 1024, 1, True, False, True, True, True, 0, 0, 0)]}
 total_blob = list()
 for hs_key in h_trait_dict:
 ...
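For context, each h_traits row is a generated kernel-instance descriptor. If the five numeric fields are read as (Repeat_M, Repeat_N, ThreadPerBlock_M, ThreadPerBlock_N, Vector_N) — an assumption based on similar ck_tile generators, not something this diff states — then the N extent one workgroup covers per pass composes multiplicatively, and this commit would replace two 4096-element 'big' footprints with 8192- and 6144-element ones. A minimal sketch under that assumption (struct and field names are illustrative, not the generator's API):

#include <cstdio>

// Hedged sketch: assuming h_traits' numeric fields are
// (Repeat_M, Repeat_N, ThreadPerBlock_M, ThreadPerBlock_N, Vector_N),
// the per-row N footprint of an instance is Repeat_N * ThreadPerBlock_N * Vector_N.
struct HTraits
{
    int repeat_m, repeat_n, thread_per_block_m, thread_per_block_n, vector_n;
    constexpr int block_n() const { return repeat_n * thread_per_block_n * vector_n; }
};

int main()
{
    // the four 'big' entries as they stand after this commit
    constexpr HTraits big[] = {
        {1, 1, 1, 1024, 8}, {1, 4, 1, 256, 4}, {1, 12, 1, 256, 2}, {1, 4, 1, 1024, 1}};
    for(const auto& t : big)
        std::printf("Block_N = %d\n", t.block_n()); // 8192, 4096, 6144, 4096
    return 0;
}

The rmsnorm2d change below makes the same kind of substitution in its own 'big' list.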
example/ck_tile/10_rmsnorm2d/generate.py

@@ -538,7 +538,7 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
 ...
     'big' : [h_traits('x', 'y', 'xs', 'ys', 1,  1, 1, 1024, 8, True, False, True, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1,  4, 1,  256, 4, True, False, True, 0, 0),
              h_traits('x', 'y', 'xs', 'ys', 1, 12, 1,  256, 2, True, False, True, 0, 0),
-             h_traits('x', 'y', 'xs', 'ys', 1, 16, 1,  256, 1, True, False, True, 0, 0)]}
+             h_traits('x', 'y', 'xs', 'ys', 1,  4, 1, 1024, 1, True, False, True, 0, 0)]}
 total_blob = list()
 for hs_key in h_trait_dict:
     hs = h_trait_dict[hs_key]
 ...
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp

@@ -182,9 +182,16 @@ struct Layernorm2dFwdPipelineTwoPass
 ...
         ck_tile::index_t stride_to_right_most_window =
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;

-        move_tile_window(x_window, {0, -Block_N});
-        move_tile_window(x_residual_window, {0, -Block_N});
-        move_tile_window(x_bias_window, {-Block_N});
+        if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
+        {
+            move_tile_window(y_residual_window, {0, -Block_N});
+        }
+        else
+        {
+            move_tile_window(x_window, {0, -Block_N});
+            move_tile_window(x_residual_window, {0, -Block_N});
+            move_tile_window(x_bias_window, {-Block_N});
+        }
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(beta_window, {stride_to_right_most_window});
         move_tile_window(y_window, {0, stride_to_right_most_window});
 ...
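Both branches then realign the gamma/beta/y windows using stride_to_right_most_window, computed in the context lines of the hunk above: it is the column offset of the right-most tile window, accounting for a final partial tile. A standalone C++ sketch of just that index math (the free function and test values are illustrative, not the kernel's API):

#include <cstdio>

// For a row of row_size elements processed in tiles of Block_N columns,
// return the column offset of the right-most tile window: the last full
// tile when the row divides evenly, otherwise the start of the final
// partial tile.
int stride_to_right_most_window(int row_size, int Block_N)
{
    return row_size % Block_N == 0 ? row_size - Block_N
                                   : row_size - row_size % Block_N;
}

int main()
{
    std::printf("%d\n", stride_to_right_most_window(4096, 1024)); // 3072
    std::printf("%d\n", stride_to_right_most_window(4100, 1024)); // 4096
    return 0;
}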
@@ -192,28 +199,43 @@ struct Layernorm2dFwdPipelineTwoPass
     // layernorm computation
     for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
     {
-        auto x = load_tile(x_window);
-        auto x_resi = load_tile(x_residual_window);
-        const auto x_bias = load_tile(x_bias_window);
-        auto acc = cast_tile<ComputeDataType>(x);
+        auto acc = make_static_distributed_tensor<ComputeDataType>(
+            decltype(load_tile(x_window))::get_tile_distribution());

-        if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+        if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE)
         {
-            sweep_tile(x, [&](auto idx) {
-                // compute x = bias + x
-                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
-                acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
-            });
+            acc = cast_tile<ComputeDataType>(load_tile(y_residual_window));
+            move_tile_window(y_residual_window, {0, -Block_N});
         }
-        if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
-                     kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+        else
         {
-            sweep_tile(x_resi, [&](auto idx) {
-                // compute x = x_resi + x
-                acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
-            });
+            acc = cast_tile<ComputeDataType>(load_tile(x_window));
+            move_tile_window(x_window, {0, -Block_N});
+            if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+            {
+                const auto x_bias = load_tile(x_bias_window);
+                move_tile_window(x_bias_window, {-Block_N});
+                sweep_tile(acc, [&](auto idx) {
+                    // compute x = bias + x
+                    constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                    acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
+                });
+            }
+            if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
+            {
+                auto x_resi = load_tile(x_residual_window);
+                move_tile_window(x_residual_window, {0, -Block_N});
+                sweep_tile(x_resi, [&](auto idx) {
+                    // compute x = x_resi + x
+                    acc(idx) = type_convert<ComputeDataType>(x_resi(idx)) + acc(idx);
+                });
+            }
+        }

         // load gamma/beta (TODO: support no gamma/beta?)
         const auto gamma = load_tile(gamma_window);
         const auto beta = load_tile(beta_window);
 ...
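Functionally, the rewritten loop builds acc for each tile in one of two ways: in PRE_ADD_STORE mode the first pass has presumably already written the fused sum to y_residual, so the second pass simply reloads it, while in the other modes the second pass recomputes the bias and residual adds on a freshly loaded x tile. A scalar model of that per-element decision (the enum names mirror the diff; everything else here is illustrative, not the pipeline's API):

#include <cstdio>

enum class FusedAdd { NONE, PRE_ADD, PRE_ADD_STORE };
enum class XBias { NO_BIAS, ADD_BIAS };

// Scalar model of how one second-pass element of `acc` is produced after
// this commit. y_residual stands for the sum stored during the first pass
// in PRE_ADD_STORE mode.
float second_pass_acc(float x, float x_resi, float x_bias, float y_residual,
                      FusedAdd fused_add, XBias xbias)
{
    if(fused_add == FusedAdd::PRE_ADD_STORE)
        return y_residual;      // reload the stored sum, no recompute
    float acc = x;
    if(xbias == XBias::ADD_BIAS)
        acc = x_bias + acc;     // compute x = bias + x
    if(fused_add == FusedAdd::PRE_ADD)
        acc = x_resi + acc;     // compute x = x_resi + x
    return acc;
}

int main()
{
    // both paths should agree when y_residual == x + bias + residual
    std::printf("%.1f\n", second_pass_acc(1.f, 2.f, 3.f, 6.f, FusedAdd::PRE_ADD, XBias::ADD_BIAS));       // 6.0
    std::printf("%.1f\n", second_pass_acc(1.f, 2.f, 3.f, 6.f, FusedAdd::PRE_ADD_STORE, XBias::ADD_BIAS)); // 6.0
    return 0;
}

Because each input window now advances immediately after its load, the trailing per-iteration moves of x_window, x_residual_window, and x_bias_window become redundant, which is exactly what the next hunk deletes.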
@@ -235,9 +257,6 @@ struct Layernorm2dFwdPipelineTwoPass
 ...
         static_assert(kFusedQuant != Layernorm2dFusedQuantEnum::DYNAMIC_QUANT);
         Epilogue{}(y_window, ln);

-        move_tile_window(x_window, {0, -Block_N});
-        move_tile_window(x_residual_window, {0, -Block_N});
-        move_tile_window(x_bias_window, {-Block_N});
         move_tile_window(gamma_window, {-Block_N});
         move_tile_window(beta_window, {-Block_N});
         move_tile_window(y_window, {0, -Block_N});
 ...
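Stepping back, this header implements the second pass of a two-pass layernorm forward: one sweep over the row to obtain mean and variance, then a second sweep that re-reads the input to normalize and apply gamma/beta. A scalar reference of the overall computation, for orientation only (the actual pipeline operates on distributed tile windows with block reductions; the function name and eps handling here are illustrative):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Scalar two-pass layernorm over one row: pass 1 accumulates mean and
// mean-of-squares, pass 2 re-reads x to emit
// y = (x - mean) / sqrt(var + eps) * gamma + beta.
std::vector<float> layernorm_two_pass(const std::vector<float>& x,
                                      const std::vector<float>& gamma,
                                      const std::vector<float>& beta,
                                      float eps = 1e-5f)
{
    const float n = static_cast<float>(x.size());
    float mean = 0.f, mean_sq = 0.f;
    for(float v : x) // pass 1: statistics
    {
        mean += v / n;
        mean_sq += v * v / n;
    }
    const float inv_std = 1.f / std::sqrt(mean_sq - mean * mean + eps);
    std::vector<float> y(x.size());
    for(std::size_t i = 0; i < x.size(); ++i) // pass 2: normalize
        y[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];
    return y;
}

int main()
{
    const auto y = layernorm_two_pass({1.f, 2.f, 3.f, 4.f}, {1.f, 1.f, 1.f, 1.f},
                                      {0.f, 0.f, 0.f, 0.f});
    for(float v : y)
        std::printf("%.4f ", v); // roughly -1.3416 -0.4472 0.4472 1.3416
    return 0;
}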