gaoqiong / composable_kernel_ROCM

Commit 7fb9b2b6 · authored Oct 30, 2024 by carlushuang

Merge remote-tracking branch 'origin/develop' into ck_tile/layernorm_fusion

Parents: 50f67a66, 3d609534

Changes: 102 files changed in total; the page shows 20 changed files with 737 additions and 123 deletions.
Files shown:

  example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py                                  +26  -15
  example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py                          +27  -15
  example/ck_tile/05_reduce/reduce.cpp                                             +35  -30
  example/ck_tile/05_reduce/reduce.hpp                                             +109 -63
  example/ck_tile/10_rmsnorm2d/CMakeLists.txt                                      +25  -0
  example/ck_tile/10_rmsnorm2d/README.md                                           +22  -0
  example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp                           +165 -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp                     +153 -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp     +22  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp     +13  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp     +14  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp      +12  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp     +14  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp     +14  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp  +14  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp      +13  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp  +12  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp      +12  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp     +22  -0
  example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp     +13  -0
example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -21,6 +21,14 @@ DTYPE_BITS = {
     "bf8"  : 8
 }
 
+K0_MAX_SUBMAX_MAP = {
+    32  : 32,
+    64  : 64,
+    96  : 128,
+    128 : 128,
+    256 : 256
+}
+
 TILE_PARTITIONER_MAP = {
     "shb" : "ck_tile::FmhaFwdTilePartitioner_SHB",
     "hbs" : "ck_tile::FmhaFwdTilePartitioner_HBS",
@@ -35,7 +43,7 @@ FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 FMHA_FWD_KERNEL_BODY = """
 using fmha_dtype_{F_idx} = {F_dtype};
 
-using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
@@ -87,7 +95,7 @@ using fmha_kernel_{F_idx} =
     fmha_pipeline_{F_idx},
     fmha_epilogue_{F_idx}>;
 
-using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                     {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
 
 #include <iostream>
@@ -125,7 +133,7 @@ FMHA_FWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
 FMHA_FWD_API_INNER_DISPATCH = """ {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
                     ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-        using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+        using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
         return fmha_fwd_<trait_>(s, a);
     }}
 """
@@ -142,7 +150,7 @@ class FmhaFwdApiTrait:
     bk0       : int  # tile size along qk gemm unroll
     bn1       : int  # tile size along v head_dim
     bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
     vlayout   : str
     mask      : str
     bias      : str  #
@@ -156,7 +164,7 @@ class FmhaFwdApiTrait:
     @property
     def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-' +\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-' +\
                 f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'
 
     @property
@@ -188,8 +196,9 @@ class FmhaFwdApiTrait:
             if self.dpad == 't' : return f'a.hdim_q % {vec} == 0'
             else :               assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't' : return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.hdim_q % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't' : return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_q % {bk0submax} == 0'
         else:   assert False
 
     @property
@@ -199,8 +208,9 @@ class FmhaFwdApiTrait:
             if self.dvpad == 't' : return f'a.hdim_v % {vec} == 0'
             else :                 assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't' : return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                 return f'a.hdim_v % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't' : return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                 return f'a.hdim_v % {bk0submax} == 0'
         else:   assert False
 
 @dataclass
@@ -271,7 +281,7 @@ class FmhaFwdApiPool:
                 F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout],
                 F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                 F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                 F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
             if_j = 'if' if j == 0 else 'else if'
             per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -289,7 +299,7 @@ class FmhaFwdTileSize:
     F_bk0       : int  # tile size along qk gemm unroll
     F_bn1       : int  # tile size along v head_dim
     F_bk1       : int  # tile size along kv gemm unroll
-    F_bk0blen   : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
     F_rm0       : int  # number of warps for gemm0 along q seqlen
     F_rn0       : int  # number of warps for gemm0 along k seqlen
     F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
@@ -302,7 +312,7 @@ class FmhaFwdTileSize:
     F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
 
     @property
     def name(self) -> str:
-        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}" +\
+        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
                 f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
                 f"_w{self.F_wm}x{self.F_wn}x{self.F_wk}" + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
@@ -335,7 +345,7 @@ class FmhaFwdKernel:
                 F_bk0     = self.F_tile.F_bk0,
                 F_bn1     = self.F_tile.F_bn1,
                 F_bk1     = self.F_tile.F_bk1,
-                F_bk0blen = self.F_tile.F_bk0blen,
+                F_bk0max  = self.F_tile.F_bk0max,
                 F_rm0     = self.F_tile.F_rm0,
                 F_rn0     = self.F_tile.F_rn0,
                 F_rk0     = self.F_tile.F_rk0,
@@ -382,7 +392,7 @@ class FmhaFwdKernel:
                    bk0     = self.F_tile.F_bk0,
                    bn1     = self.F_tile.F_bn1,
                    bk1     = self.F_tile.F_bk1,
-                   bk0blen = self.F_tile.F_bk0blen,
+                   bk0max  = self.F_tile.F_bk0max,
                    vlayout = self.F_pipeline.F_vlayout,
                    mask    = self.F_pipeline.F_mask,
                    bias    = self.F_pipeline.F_bias,
@@ -401,6 +411,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
         return {
             '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32,  2, 1, 1, 2, 1, 1, 32, 32, 16, -1),
             '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64,  4, 1, 1, 4, 1, 1, 32, 32, 16, -1),
+            ## '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32, 96,  4, 1, 1, 4, 1, 1, 32, 32, 16, -1),
             '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1),
             '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 32, 32, 16, -1),
         }
@@ -510,4 +521,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
     _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
     for kernel in kernels:
         f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
     f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
\ No newline at end of file
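As a quick illustration of what this change does to the generated dispatch code: for the 'qr' pipeline the head-dim divisibility check is now derived from K0_MAX_SUBMAX_MAP instead of using bk0blen directly. The sketch below is a simplified, stand-alone Python reimplementation (not the generator itself); the helper name make_hdim_check is made up for this example.

```python
# Simplified sketch of the new check-string generation for the 'qr' pipeline.
# K0_MAX_SUBMAX_MAP values are taken from the diff above; make_hdim_check is a
# hypothetical helper, not part of the real generator.
K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 256: 256}

def make_hdim_check(bk0max: int, dpad: str, field: str = "hdim_q") -> str:
    bk0submax = K0_MAX_SUBMAX_MAP[bk0max]
    if dpad == 't':
        return f'true /*a.{field} % {bk0submax} != 0*/'
    else:
        return f'a.{field} % {bk0submax} == 0'

# e.g. a (currently commented-out) hdim-96 tile would now be checked against 128:
print(make_hdim_check(96, 'f'))   # -> "a.hdim_q % 128 == 0"
print(make_hdim_check(128, 't'))  # -> "true /*a.hdim_q % 128 != 0*/"
```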
example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -29,6 +29,14 @@ DTYPE_BITS = {
     "bf8"  : 8
 }
 
+K0_MAX_SUBMAX_MAP = {
+    32  : 32,
+    64  : 64,
+    96  : 128,
+    128 : 128,
+    256 : 256
+}
+
 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
     "qr"       : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
     "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
@@ -41,7 +49,7 @@ using fmha_mask_{F_idx} = {F_mask};
 namespace {{
 template <bool kHasUnevenSplits>
 struct kernel_runner {{
-using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
+using fmha_block_tile = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
 using fmha_warp_tile = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
 
 using fmha_shape = ck_tile::TileFmhaShape<fmha_block_tile,
@@ -103,7 +111,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }};
 }}
 
-using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
+using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
                     {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
                     {F_dvpad}>;
@@ -241,7 +249,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH = """ {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
                     ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-        using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+        using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
         using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}/2, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
         return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
@@ -260,7 +268,7 @@ class FmhaFwdSplitKVApiTrait:
     bk0       : int  # tile size along qk gemm unroll
     bn1       : int  # tile size along v head_dim
     bk1       : int  # tile size along kv gemm unroll
-    bk0blen   : int
+    bk0max    : int
     vlayout   : str
     mask      : str
     bias      : str  #
@@ -270,11 +278,11 @@ class FmhaFwdSplitKVApiTrait:
     skpad     : str
     dpad      : str
     dvpad     : str
     pagedkv   : str
 
     @property
     def name(self) -> str:
-        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0blen}-' +\
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-' +\
                 f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-' +\
                 f'{self.dvpad}-{self.pagedkv}'
@@ -307,8 +315,9 @@ class FmhaFwdSplitKVApiTrait:
             if self.dpad == 't' : return f'a.hdim_q % {vec} == 0'
             else :               assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dpad == 't' : return f'true /*a.hdim_q % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                return f'a.hdim_q % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't' : return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_q % {bk0submax} == 0'
         else:   assert False
 
     @property
@@ -318,8 +327,9 @@ class FmhaFwdSplitKVApiTrait:
             if self.dvpad == 't' : return f'a.hdim_v % {vec} == 0'
            else :                 assert False
         elif self.pipeline_tag in ['qr']:
-            if self.dvpad == 't' : return f'true /*a.hdim_v % {self.bk0blen} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
-            else :                 return f'a.hdim_v % {self.bk0blen} == 0'
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't' : return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                 return f'a.hdim_v % {bk0submax} == 0'
         else:   assert False
 
 @dataclass
@@ -414,7 +424,7 @@ class FmhaFwdSplitKVApiPool:
                 F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv],
                 F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                 F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
+                F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                 F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
             if_j = 'if' if j == 0 else 'else if'
             per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
@@ -458,7 +468,7 @@ class FmhaFwdSplitKVKernel:
                 F_bk0     = self.F_tile.F_bk0,
                 F_bn1     = self.F_tile.F_bn1,
                 F_bk1     = self.F_tile.F_bk1,
-                F_bk0blen = self.F_tile.F_bk0blen,
+                F_bk0max  = self.F_tile.F_bk0max,
                 F_rm0     = self.F_tile.F_rm0,
                 F_rn0     = self.F_tile.F_rn0,
                 F_rk0     = self.F_tile.F_rk0,
@@ -504,7 +514,7 @@ class FmhaFwdSplitKVKernel:
                    bk0     = self.F_tile.F_bk0,
                    bn1     = self.F_tile.F_bn1,
                    bk1     = self.F_tile.F_bk1,
-                   bk0blen = self.F_tile.F_bk0blen,
+                   bk0max  = self.F_tile.F_bk0max,
                    vlayout = self.F_pipeline.F_vlayout,
                    mask    = self.F_pipeline.F_mask,
                    bias    = self.F_pipeline.F_bias,
@@ -559,6 +569,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
         return {
             '32'  : FmhaFwdTileSize(32, 64, 16, 32, 32, 32,  2, 1, 1, 2, 1, 1, 16, 16, 16, -1),
             '64'  : FmhaFwdTileSize(64, 64, 32, 64, 32, 64,  4, 1, 1, 4, 1, 1, 16, 16, 16, -1),
+            ## '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32, 96,  4, 1, 1, 4, 1, 1, 16, 16, 16, -1),
             '128' : FmhaFwdTileSize(64, 128, 32, 128, 32, 128, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1),
             '256' : FmhaFwdTileSize(64, 128, 32, 256, 32, 256, 4, 1, 1, 4, 1, 1, 16, 16, 16, -1),
         }
@@ -576,6 +587,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
         return {
             '32'  : FmhaFwdSplitKVCombineTileSize(16, 16, -1),
             '64'  : FmhaFwdSplitKVCombineTileSize(32, 32, -1),
+            ## '96'  : FmhaFwdSplitKVCombineTileSize(32, 64, -1),
             '128' : FmhaFwdSplitKVCombineTileSize(32, 64, -1),
             '256' : FmhaFwdSplitKVCombineTileSize(32, 128, -1),
         }
@@ -604,7 +616,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
             if dtype in ['fp16', 'bf16']:
                 for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
                     # TODO: use async pipeline when compiler is more stable
-                    if hdim == 256 or hdim in [32, 64, 128]:
+                    if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]:
                     # if True:
                         pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
                         pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask))
@@ -743,4 +755,4 @@ def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_im
     _, kernels = get_fwd_splitkv_blobs(kernel_filter, receipt, mask_impl)
     for kernel in kernels:
         f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
     f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
\ No newline at end of file
example/ck_tile/05_reduce/reduce.cpp
@@ -19,9 +19,9 @@ auto create_args(int argc, char* argv[])
 template <typename DataType>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    using ADataType   = DataType;
-    using AccDataType = float;
-    using BDataType   = DataType;
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
 
     ck_tile::index_t m = arg_parser.get_int("m");
     ck_tile::index_t n = arg_parser.get_int("n");
@@ -29,35 +29,39 @@ bool run(const ck_tile::ArgParser& arg_parser)
     int warmup = arg_parser.get_int("warmup");
     int repeat = arg_parser.get_int("repeat");
 
-    ck_tile::HostTensor<ADataType> a_host({m, n});
-    ck_tile::HostTensor<BDataType> b_host_ref({m});
-    ck_tile::HostTensor<BDataType> b_host_dev({m});
+    ck_tile::HostTensor<XDataType> x_host({m, n});
+    ck_tile::HostTensor<YDataType> y_host_ref({m});
+    ck_tile::HostTensor<YDataType> y_host_dev({m});
 
-    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
 
-    ck_tile::DeviceMem a_buf(a_host.get_element_space_size_in_bytes());
-    ck_tile::DeviceMem b_buf(b_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
 
-    a_buf.ToDevice(a_host.data());
+    x_buf.ToDevice(x_host.data());
 
+    using ReduceOp   = ck_tile::ReduceOp::Add;
     using BlockWarps = ck_tile::sequence<4, 1>;
     using BlockTile  = ck_tile::sequence<128, 128>;
     using WarpTile   = ck_tile::sequence<32, 128>;
-    using ThreadTile = ck_tile::sequence<8, 8>;
+    using Vector     = ck_tile::sequence<8, 8>;
 
-    constexpr ck_tile::index_t kBlockSize  = 256;
+    // cross warp-reduce
+    // using BlockWarps = ck_tile::sequence<2, 2>;
+    // using BlockTile  = ck_tile::sequence<2, 1024>;
+    // using WarpTile   = ck_tile::sequence<1, 512>;
+    // using Vector     = ck_tile::sequence<1, 8>;
+
+    constexpr ck_tile::index_t kBlockSize  = 512;
     constexpr ck_tile::index_t kBlockPerCu = 1;
     ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
     std::cout << "grid size " << kGridSize << std::endl;
 
-    using Kernel = ck_tile::Reduce<ADataType,
-                                   AccDataType,
-                                   BDataType,
-                                   kBlockSize,
-                                   BlockWarps,
-                                   BlockTile,
-                                   WarpTile,
-                                   ThreadTile>;
+    using Shape   = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
+    using Porblem =
+        ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
+
+    using Kernel = ck_tile::Reduce<Porblem>;
 
     float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                    ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
@@ -65,12 +69,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                        kGridSize,
                                        kBlockSize,
                                        0,
-                                       static_cast<ADataType*>(a_buf.GetDeviceBuffer()),
-                                       static_cast<BDataType*>(b_buf.GetDeviceBuffer()),
+                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
                                        m,
                                        n));
 
-    std::size_t num_btype = sizeof(ADataType) * m * n + sizeof(BDataType) * m;
+    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
 
     float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -81,9 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         // reference
-        ck_tile::reference_reduce<ADataType, AccDataType, BDataType>(a_host, b_host_ref);
-        b_buf.FromDevice(b_host_dev.mData.data());
-        pass = ck_tile::check_err(b_host_dev, b_host_ref);
+        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
+            x_host, y_host_ref, ReduceOp{});
+        y_buf.FromDevice(y_host_dev.mData.data());
+        pass = ck_tile::check_err(y_host_dev, y_host_ref);
 
         std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
     }
@@ -103,8 +108,8 @@ int main(int argc, char* argv[])
     {
         return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
     }
-    if(data_type == "bf16")
-    {
-        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
-    }
+    // else if(data_type == "bf16")
+    // {
+    //     return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    // }
 }
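For orientation, the host-side validation in this example compares the kernel output against a row-wise add reduction (ReduceOp::Add) over the N dimension, accumulated in float. The snippet below is a minimal NumPy sketch of that reference, not the ck_tile reference_reduce implementation.

```python
# Minimal sketch of the reference this example validates against: a row-wise
# add-reduction over N, accumulated in float32, one output value per row.
import numpy as np

def reference_reduce_add(x: np.ndarray) -> np.ndarray:
    # x: (m, n) input; accumulate in float32 like ComputeDataType = float
    return x.astype(np.float32).sum(axis=1)

x = np.random.uniform(-5.0, 5.0, size=(128, 1024)).astype(np.float16)
y = reference_reduce_add(x)  # shape (128,), float32
```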
example/ck_tile/05_reduce/reduce.hpp
@@ -5,20 +5,16 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/common.hpp"
 #include "ck_tile/ops/reduce/block/block_reduce.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
 
 namespace ck_tile {
 
-template <typename ADataType,
-          typename AccDataType,
-          typename BDataType,
-          index_t kBlockSize,
-          typename BlockWarps,  // num warps along seq<M, N>
-          typename BlockTile,   // block size, seq<M, N>
-          typename WarpTile,    // warp size, seq<M, N>
-          typename ThreadTile>  // contiguous pixels(vector size) along seq<M, N>
-struct Reduce
+template <typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
+struct Reduce2dShape
 {
     static constexpr index_t Block_M = BlockTile::at(number<0>{});
     static constexpr index_t Block_N = BlockTile::at(number<1>{});
@@ -26,93 +22,143 @@ struct Reduce
     static constexpr index_t Warp_M = WarpTile::at(number<0>{});
     static constexpr index_t Warp_N = WarpTile::at(number<1>{});
 
-    static constexpr index_t Thread_M = ThreadTile::at(number<0>{});
-    static constexpr index_t Thread_N = ThreadTile::at(number<1>{});
+    static constexpr index_t Vector_M = Vector::at(number<0>{});
+    static constexpr index_t Vector_N = Vector::at(number<1>{});
 
     static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
     static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
 
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Thread_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Thread_N;
+    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
 
     static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
     static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
 
-    __device__ static constexpr auto MakeABlockTileDistribution()
-    {
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<>,
-                tuple<sequence<Repeat_M, WarpPerBlock_M, ThreadPerWarp_M, Thread_M>,
-                      sequence<Repeat_N, WarpPerBlock_N, ThreadPerWarp_N, Thread_N>>,
-                tuple<sequence<1, 2>, sequence<1, 2>>,
-                tuple<sequence<1, 1>, sequence<2, 2>>,
-                sequence<1, 1, 2, 2>,
-                sequence<0, 3, 0, 3>>{});
-    }
-
-    __device__ void operator()(const ADataType* p_a, BDataType* p_b, index_t M, index_t N) const
-    {
-        const auto a_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_a, make_tuple(M, N), make_tuple(N, 1), number<Thread_N>{}, number<1>{});
-
-        const auto iM = get_block_id() * Block_M;
-
-        // A window
-        auto a_block_window = make_tile_window(a_m_n,
-                                               make_tuple(number<Block_M>{}, number<Block_N>{}),
-                                               {iM, 0},
-                                               MakeABlockTileDistribution());
-
-        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
-
-        const ADataType reduce_init_value = 0;
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        // Acc tile
-        // TODO: support cross warp reduction
-        auto acc_block_tensor = decltype(block_tile_reduce<AccDataType>(
-            load_tile(a_block_window), reduce_dims, f_reduce, reduce_init_value)){};
-
-        // init Acc tile
-        tile_elementwise_inout(
-            [&](auto& acc) { acc = type_convert<AccDataType>(reduce_init_value); }, acc_block_tensor);
-
-        // loop
-        index_t iN = 0;
-        do
-        {
-            const auto a_block_tensor = load_tile(a_block_window);
-
-            block_tile_reduce(acc_block_tensor, a_block_tensor, reduce_dims, f_reduce);
-
-            move_tile_window(a_block_window, {0, Block_N});
-            iN += Block_N;
-        } while(iN < N);
-
-        // FIXME: support cross warp reduction
-        block_tile_reduce_sync(acc_block_tensor, f_reduce);
-
-        // convert acc_block_tensor to b_block_tensor
-        const auto b_block_tensor = tile_elementwise_in(
-            [](const auto& acc) { return type_convert<BDataType>(acc); }, acc_block_tensor);
-
-        // B
-        const auto b_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_b, make_tuple(M), number<32>{});
-
-        // B window
-        auto b_block_window = make_tile_window(b_m, make_tuple(number<Block_M>{}), {iM});
-
-        // store B tile
-        store_tile(b_block_window, b_block_tensor);
-    }
+    static constexpr index_t BlockSize =
+        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+
+template <typename XDataType_,
+          typename ComputeDataType_,
+          typename YDataType_,
+          typename BlockShape_,
+          typename ReduceOp_>
+struct Reduce2dProblem
+{
+    using XDataType       = remove_cvref_t<XDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using YDataType       = remove_cvref_t<YDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using ReduceOp        = ReduceOp_;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+};
+
+template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
+struct Reduce
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+
+#if 0
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{});
+
+        const auto iM = get_block_id() * S::Block_M;
+
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
+
+        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        const XDataType reduce_init_value = 0;
+
+        constexpr auto reduce_dims = sequence<1>{};
+
+        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
+            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
+
+        set_tile(y_compute, reduce_init_value);
+
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
+
+        // FIXME: support cross warp reduction
+        block_tile_reduce_sync(y_compute, f_reduce);
+
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
+    }
+#else
+    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
+    {
+        using S = typename Problem::BlockShape;
+
+        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
+        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
+            p_y, make_tuple(M), number<1>{});
+
+        const auto iM = get_block_id() * S::Block_M;
+
+        auto x_window = make_tile_window(x_m_n,
+                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
+                                         {iM, 0},
+                                         Policy::template MakeXBlockTileDistribution<Problem>());
+        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
+
+        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
+
+        index_t num_n_tile_iteration =
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
+
+        auto reduce_func = typename Problem::ReduceOp{};
+
+        auto block_reduce2d                 = Policy::template GetBlockReduce2d<Problem>();
+        auto block_reduce2d_sync            = Policy::template GetBlockReduce2dSync<Problem>();
+        auto block_reduce2d_cross_warp_sync =
+            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
+
+        using XTensorType = decltype(load_tile(x_window));
+        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
+        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
+
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        {
+            const auto x = load_tile(x_window);
+            block_reduce2d(x, y_compute, reduce_func);
+            move_tile_window(x_window, {0, S::Block_N});
+        }
+
+        block_reduce2d_sync(y_compute, reduce_func);
+        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
+
+        store_tile(y_window, cast_tile<YDataType>(y_compute));
+    }
+#endif
 };
 
 } // namespace ck_tile
example/ck_tile/10_rmsnorm2d/CMakeLists.txt
new file (mode 100644)
set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
message("adding ${TILE_RMSNORM2D_FWD}")
file(GLOB INSTANCE_SRCS instances/*.cpp)
add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${INSTANCE_SRCS})

set(TILE_RMSNORM2D_FWD_COMPILE_OPTIONS)

# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)

target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})

set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp)
target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})

# TODO: we have to turn off this global prop, otherwise the progress bar generated
# by cmake will print too many files, execvp: /bin/sh: Argument list too long
# however, this property may affect global
# TODO: consider codegen a makefile by us
set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
example/ck_tile/10_rmsnorm2d/README.md
new file (mode 100644)
# Rmsnorm2D forward
This folder contains an example of Rmsnorm2D forward using the ck_tile tile-programming implementation.

## build
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> with gfx90a, gfx942...
make tile_rmsnorm2d_fwd -j
```
This will result in an executable `build/bin/tile_rmsnorm2d_fwd`.

## cmdline
```
args:
          -m    m dimension (default:3328)
          -n    n dimension (default:4096)
          -e    epsilon (default:1e-5)
          -v    cpu validation or not (default:1)
       -prec    precision (default:fp16)
```
example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
new file (mode 100644)
#include "ck_tile/host.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/rmsnorm2d.hpp"
#include <cstring>
auto
create_args
(
int
argc
,
char
*
argv
[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"m"
,
"3328"
,
"m dimension"
)
.
insert
(
"n"
,
"4096"
,
"n dimension"
)
.
insert
(
"stride"
,
"-1"
,
"stride per row, if -1 then equal to n"
)
.
insert
(
"e"
,
"1e-5"
,
"epsilon"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"prec"
,
"fp16"
,
"precision"
)
.
insert
(
"warmup"
,
"0"
,
"cold iter"
)
.
insert
(
"repeat"
,
"1"
,
"hot iter"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
template
<
typename
DataType
>
bool
run
(
const
ck_tile
::
ArgParser
&
arg_parser
)
{
ck_tile
::
index_t
m
=
arg_parser
.
get_int
(
"m"
);
ck_tile
::
index_t
n
=
arg_parser
.
get_int
(
"n"
);
ck_tile
::
index_t
stride
=
arg_parser
.
get_int
(
"stride"
);
if
(
stride
<
0
)
stride
=
n
;
float
epsilon
=
arg_parser
.
get_float
(
"e"
);
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
int
do_validation
=
arg_parser
.
get_int
(
"v"
);
int
warmup
=
arg_parser
.
get_int
(
"warmup"
);
int
repeat
=
arg_parser
.
get_int
(
"repeat"
);
assert
(
stride
>=
n
);
using
XDataType
=
DataType
;
using
YDataType
=
DataType
;
using
GammaDataType
=
DataType
;
using
InvRmsDataType
=
ck_tile
::
null_type
;
using
ComputeDataType
=
float
;
// host verify
ck_tile
::
HostTensor
<
XDataType
>
x_host
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
GammaDataType
>
gamma_host
({
n
});
ck_tile
::
HostTensor
<
YDataType
>
y_host_ref
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
YDataType
>
y_host_dev
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
InvRmsDataType
>
invRms_host_ref
({
m
});
ck_tile
::
FillUniformDistribution
<
XDataType
>
{
-
.5
f
,
.5
f
}(
x_host
);
ck_tile
::
FillUniformDistribution
<
GammaDataType
>
{
-
.5
f
,
.5
f
}(
gamma_host
);
ck_tile
::
DeviceMem
x_buf
(
x_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
gamma_buf
(
gamma_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
y_buf
(
y_host_dev
.
get_element_space_size_in_bytes
());
x_buf
.
ToDevice
(
x_host
.
data
());
gamma_buf
.
ToDevice
(
gamma_host
.
data
());
constexpr
bool
kTwoPass
=
true
;
using
BlockWarps
=
ck_tile
::
sequence
<
2
,
2
>
;
using
BlockTile
=
ck_tile
::
sequence
<
2
,
128
>
;
using
WarpTile
=
ck_tile
::
sequence
<
1
,
64
>
;
using
Vector
=
ck_tile
::
sequence
<
1
,
1
>
;
using
Shape
=
ck_tile
::
Rmsnorm2dShape
<
BlockTile
,
BlockWarps
,
WarpTile
,
Vector
>
;
using
Problem
=
ck_tile
::
Rmsnorm2dFwdPipelineProblem
<
XDataType
,
GammaDataType
,
ComputeDataType
,
YDataType
,
InvRmsDataType
,
Shape
,
true
,
// kPadN
false
,
// kSaveInvRms
kTwoPass
>
;
using
OnePassPipeline
=
ck_tile
::
Rmsnorm2dFwdPipelineOnePass
<
Problem
>
;
using
TwoPassPipeline
=
ck_tile
::
Rmsnorm2dFwdPipelineTwoPass
<
Problem
>
;
using
Pipeline
=
std
::
conditional_t
<
kTwoPass
,
TwoPassPipeline
,
OnePassPipeline
>
;
using
Kernel
=
ck_tile
::
Rmsnorm2dFwd
<
Pipeline
>
;
ck_tile
::
Rmsnorm2dFwdHostArgs
args
{
x_buf
.
GetDeviceBuffer
(),
gamma_buf
.
GetDeviceBuffer
(),
y_buf
.
GetDeviceBuffer
(),
nullptr
,
epsilon
,
m
,
n
,
stride
};
auto
kargs
=
Kernel
::
MakeKargs
(
args
);
const
dim3
grids
=
Kernel
::
GridSize
(
args
);
constexpr
dim3
blocks
=
Kernel
::
BlockSize
();
constexpr
ck_tile
::
index_t
kBlockPerCu
=
1
;
auto
s
=
ck_tile
::
stream_config
{
nullptr
,
true
,
0
,
warmup
,
repeat
};
ck_tile
::
launch_kernel
(
s
,
ck_tile
::
make_kernel
<
blocks
.
x
,
kBlockPerCu
>
(
Kernel
{},
grids
,
blocks
,
0
,
kargs
));
bool
pass
=
true
;
if
(
do_validation
)
{
// reference
ck_tile
::
reference_rmsnorm2d_fwd
<
XDataType
,
GammaDataType
,
ComputeDataType
,
YDataType
,
InvRmsDataType
>
(
x_host
,
gamma_host
,
y_host_ref
,
invRms_host_ref
,
epsilon
);
y_buf
.
FromDevice
(
y_host_dev
.
data
());
auto
[
rtol
,
atol
]
=
ck_tile
::
make_tuple
(
1e-3
,
1e-3
);
if
(
stride
==
n
)
{
pass
=
ck_tile
::
check_err
(
y_host_dev
,
y_host_ref
,
std
::
string
(
"OUT Error: Incorrect results!"
),
rtol
,
atol
);
}
else
{
for
(
int
i_r
=
0
;
i_r
<
m
;
i_r
++
)
{
std
::
vector
<
YDataType
>
y_host_dev_row
(
y_host_dev
.
begin
()
+
i_r
*
stride
,
y_host_dev
.
begin
()
+
i_r
*
stride
+
n
);
std
::
vector
<
YDataType
>
y_host_ref_row
(
y_host_ref
.
begin
()
+
i_r
*
stride
,
y_host_ref
.
begin
()
+
i_r
*
stride
+
n
);
pass
&=
ck_tile
::
check_err
(
y_host_dev_row
,
y_host_ref_row
,
std
::
string
(
"OUT["
)
+
std
::
to_string
(
i_r
)
+
std
::
string
(
"] Error: Incorrect results!"
),
rtol
,
atol
);
}
}
std
::
cout
<<
"["
<<
data_type
<<
"]"
<<
" m:"
<<
m
<<
", n:"
<<
n
<<
", stride:"
<<
stride
<<
", valid:"
<<
(
pass
?
"y"
:
"n"
)
<<
std
::
flush
<<
std
::
endl
;
}
return
pass
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
auto
[
result
,
arg_parser
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
const
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
if
(
data_type
==
"fp16"
)
{
return
run
<
ck_tile
::
half_t
>
(
arg_parser
)
?
0
:
-
2
;
}
return
-
3
;
}
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_api.cpp
new file (mode 100644)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "rmsnorm2d_fwd.hpp"
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
Repeat_N_
,
// each thread repeat along N
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
ck_tile
::
index_t
ThreadPerBlock_N_
,
                                              // num threads along N
          ck_tile::index_t Vector_N_,         // vector size along N
          bool kPadN_,
          bool kSaveInvRms_,
          bool kTwoPass_>
using trait_ = rmsnorm2d_fwd_traits_<DataType_,
                                     Repeat_M_,
                                     Repeat_N_,
                                     ThreadPerBlock_M_,
                                     ThreadPerBlock_N_,
                                     Vector_N_,
                                     kPadN_,
                                     kSaveInvRms_,
                                     kTwoPass_>;

template <typename data_type>
float rmsnorm2d_fwd_b16_(rmsnorm2d_fwd_traits /*t*/,
                         rmsnorm2d_fwd_args a,
                         const ck_tile::stream_config& s)
{
#if 1
    float r = -1;
    // clang-format off
    //                                          rm  rn  tm    tn  vn  pd     rms    2p
    if(a.n <= 64) {
        r = rmsnorm2d_fwd_<trait_<data_type,     1,  1,  4,   64,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 128) {
        if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  4,   64,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  4,   64,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 256) {
        if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  4,   64,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  4,   64,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  4,   64,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 512) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  4,   64,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  4,   64,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  4,   64,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  8,  4,   64,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 768) {
        if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  4,   64,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  6,  4,   64,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1, 12,  4,   64,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 1024) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  2,  128,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  2,  128,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  2,  128,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1,  256,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 1536) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  4,   64,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  2,  128,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  1,  256,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  6,  1,  256,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 2048) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  1,  1,  256,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  1,  256,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1,  256,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  8,  1,  256,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 3072) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  1,  128,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  1,  256,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  6,  1,  256,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  3,  1, 1024,  1, true , false, false>>(s, a);
    }
    else if(a.n <= 4096) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  1,  256,  8, true , false, false>>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1,  256,  4, true , false, false>>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  1, 1024,  2, true , false, false>>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1, 1024,  1, true , false, false>>(s, a);
    }
    else if(a.n > 4096) {
        if(a.n % 8 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  1,  256,  8, true , false, true >>(s, a);
        else if(a.n % 4 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1,  256,  4, true , false, true >>(s, a);
        else if(a.n % 2 == 0)
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  2,  1, 1024,  2, true , false, true >>(s, a);
        else
            r = rmsnorm2d_fwd_<trait_<data_type, 1,  4,  1, 1024,  1, true , false, true >>(s, a);
    }
    return r;
#else
    return rmsnorm2d_fwd_<trait_<data_type, 1, 1, 1, 256, 4, true , false, false>>(s, a);
#endif
    // clang-format on
}

float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                    rmsnorm2d_fwd_args a,
                    const ck_tile::stream_config& s)
{
    float r = -1;
    if(t.data_type.compare("fp16") == 0) {
        return rmsnorm2d_fwd_b16_<ck_tile::fp16_t>(t, a, s);
    }
    else if(t.data_type.compare("bf16") == 0) {
        return rmsnorm2d_fwd_b16_<ck_tile::bf16_t>(t, a, s);
    }

    if(r < 0)
        throw std::runtime_error("Without supported instances!");
    return r;
}
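The dispatcher above is essentially a lookup table keyed on the row length `a.n`: within each size bucket it picks the widest contiguous access along N (the `vn` column: 8/4/2/1 elements) that still divides `n`, trades the thread-block shape (`tm` x `tn`) against per-thread repeats (`rn`), and switches to the two-pass variant (`kTwoPass_ = true`) only once a row exceeds 4096 elements. A minimal standalone sketch of that selection rule follows; `tile_choice` and `pick_rmsnorm2d_tile` are hypothetical names used for illustration only and are not part of this commit.

#include <cstdint>

// Hypothetical helper mirroring the dispatch heuristic above: choose the widest
// vector access along N that divides the row length (capped by what the block
// tile allows), and flag the two-pass kernel for rows longer than 4096.
struct tile_choice
{
    int vector_n;  // elements per thread per access along N (the "vn" column)
    bool two_pass; // the "2p" column: true selects the kTwoPass_ instances
};

inline tile_choice pick_rmsnorm2d_tile(std::int64_t n, int max_vector)
{
    tile_choice c{1, n > 4096};
    for(int v = max_vector; v > 1; v /= 2)
    {
        if(n % v == 0)
        {
            c.vector_n = v;
            break;
        }
    }
    return c;
}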
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1024_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
#if 0
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  4,  64,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  4,  64,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  8,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1, 16,  4,  64,  1, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  1, 256,  4, true , false, false>>(const S&, A);
#endif
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  2, 128,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  2, 128,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  2, 128,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1, 256,  1, true , false, false>>(const S&, A);
// clang-format on
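Each of these `*_instance.cpp` files is a translation unit of explicit instantiations covering the tile shapes that the dispatcher in `rmsnorm2d_fwd_api.cpp` refers to, so the heavy kernel compilation can be spread across files. The shared header presumably supplies the `trait_` alias and the `S`/`A` shorthands; the outline below is only a sketch inferred from the call sites above, not the actual `rmsnorm2d_fwd_instance_common.hpp` in this commit.

// Sketch only: inferred outline of what rmsnorm2d_fwd_instance_common.hpp provides
// for these translation units; the real header in this commit may differ.
#pragma once

#include "rmsnorm2d_fwd.hpp" // assumption: declares rmsnorm2d_fwd_args and the rmsnorm2d_fwd_ launcher

using S = ck_tile::stream_config; // stream/launch configuration, passed as (const S&, A) above
using A = rmsnorm2d_fwd_args;     // host-side arguments (e.g. the row length a.n the dispatcher keys on)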
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n1536_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  4,  64,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  2, 128,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  1, 256,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  6,  1, 256,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n2048_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  1, 256,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  1, 256,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1, 256,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  8,  1, 256,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n256_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  4,  64,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  4,  64,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n3072_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm    tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  1,  128,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  1,  256,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  6,  1,  256,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  1, 1024,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm    tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  1,  256,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1,  256,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  1, 1024,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1, 1024,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n4096_tp_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm    tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  1,  256,  8, true , false, true >>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1,  256,  4, true , false, true >>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  1, 1024,  2, true , false, true >>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  1, 1024,  1, true , false, true >>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n512_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  4,  64,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  4,  64,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  4,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  8,  4,  64,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n64_n128_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  4,  64,  1, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  1,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  2,  4,  64,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_bf16_n768_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  3,  4,  64,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1,  6,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::bf16_t,    1, 12,  4,  64,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1024_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
#if 0
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  2,  4,  64,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  4,  4,  64,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  8,  4,  64,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1, 16,  4,  64,  1, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  1,  1, 256,  4, true , false, false>>(const S&, A);
#endif
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  1,  2, 128,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  2,  2, 128,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  4,  2, 128,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  4,  1, 256,  1, true , false, false>>(const S&, A);
// clang-format on
example/ck_tile/10_rmsnorm2d/instances/rmsnorm2d_fwd_fp16_n1536_instance.cpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "rmsnorm2d_fwd_instance_common.hpp"

// clang-format off
//                                                      rm  rn  tm   tn  vn  pd     rms    2p
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  3,  4,  64,  8, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  3,  2, 128,  4, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  3,  1, 256,  2, true , false, false>>(const S&, A);
template float rmsnorm2d_fwd_<trait_<ck_tile::fp16_t,    1,  6,  1, 256,  1, true , false, false>>(const S&, A);
// clang-format on