gaoqiong / composable_kernel_ROCM · Commit 7fb9b2b6

Merge remote-tracking branch 'origin/develop' into ck_tile/layernorm_fusion

authored Oct 30, 2024 by carlushuang
parents: 50f67a66, 3d609534

Changes: 102 files in total; this page shows 20 changed files with 983 additions and 78 deletions (+983 -78).
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp    +12  -11
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp                +12  -11
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp          +12  -11
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp            +11  -11
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp                +12  -11
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp  +9   -6
include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp                             +18  -2
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp    +2   -2
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp    +2   -4
include/ck_tile/ops/reduce.hpp                                                    +3   -0
include/ck_tile/ops/reduce/block/block_reduce.hpp                                 +10  -9
include/ck_tile/ops/reduce/block/block_reduce2d.hpp                               +260 -0
include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp                +79  -0
include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp                       +18  -0
include/ck_tile/ops/rmsnorm2d.hpp                                                 +12  -0
include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp                     +202 -0
include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp                      +78  -0
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp  +94  -0
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp        +101 -0
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp         +36  -0
include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp

@@ -34,12 +34,13 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
     static constexpr index_t kBlockSize = Problem::kBlockSize;
     static constexpr index_t kM0 = BlockFmhaShape::kM0;
     static constexpr index_t kN0 = BlockFmhaShape::kN0;
     static constexpr index_t kK0 = BlockFmhaShape::kK0;
     static constexpr index_t kN1 = BlockFmhaShape::kN1;
     static constexpr index_t kK1 = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;

@@ -75,22 +76,22 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
         return Problem::kBlockPerCu;
     else
     {
-        if constexpr(kK0BlockLength <= 32)
+        if constexpr(kQKHeaddim <= 32)
         {
             return 2;
         }
-        else if constexpr(kK0BlockLength <= 64)
+        else if constexpr(kQKHeaddim <= 64)
         {
             return 3;
         }
-        else if constexpr(kK0BlockLength <= 128)
+        else if constexpr(kQKHeaddim <= 128)
         {
             if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 return 1;
             else
                 return 2;
         }
-        else if constexpr(kK0BlockLength <= 256)
+        else if constexpr(kQKHeaddim <= 256)
         {
             return 1;
         }

@@ -270,7 +271,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
         static_assert(2 <= k0_loops);
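The occupancy heuristic touched above reads as a ladder on the QK head dimension; the rename is safe because kK0BlockLength and kQKHeaddim are the same quantity (see the tile_fmha_shape.hpp hunk later in this page, where the same BlockTile slot is renamed). A minimal host-side sketch, not part of the commit, reproduces the no-bias branch of the ladder; only the thresholds and return values are taken from the hunk:

#include <cstdio>

// Illustrative only: mirrors the kBlockPerCu ladder from the hunk above for the
// no-elementwise-bias case. "head_dim" stands in for kQKHeaddim.
constexpr int blocks_per_cu(int head_dim)
{
    if(head_dim <= 32)
        return 2;
    if(head_dim <= 64)
        return 3;
    if(head_dim <= 128)
        return 2; // 1 when ELEMENTWISE_BIAS is enabled
    if(head_dim <= 256)
        return 1;
    return 1; // behavior beyond 256 is not shown in this hunk; assumed fallthrough
}

int main()
{
    for(int hd : {32, 64, 128, 256})
        std::printf("head_dim=%d -> %d blocks per CU\n", hd, blocks_per_cu(hd));
}

Smaller head dims leave more registers/LDS per CU, hence the higher block counts at the top of the ladder.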
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp

Same rename as the previous file, in the corresponding places:

@@ -37,12 +37,13 @@ struct BlockFmhaPipelineQRKSVS
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

@@ -76,22 +77,22 @@ struct BlockFmhaPipelineQRKSVS
-        if constexpr(kK0BlockLength <= 32)
+        if constexpr(kQKHeaddim <= 32)
-        else if constexpr(kK0BlockLength <= 64)
+        else if constexpr(kQKHeaddim <= 64)
-        else if constexpr(kK0BlockLength <= 128)
+        else if constexpr(kQKHeaddim <= 128)
-        else if constexpr(kK0BlockLength <= 256)
+        else if constexpr(kQKHeaddim <= 256)

@@ -261,7 +262,7 @@ struct BlockFmhaPipelineQRKSVS
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
         static_assert(2 <= k0_loops);
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp

@@ -38,12 +38,13 @@ struct BlockFmhaPipelineQRKSVSAsync
     static constexpr index_t kBlockSize = Problem::kBlockSize;
     static constexpr index_t kM0 = BlockFmhaShape::kM0;
     static constexpr index_t kN0 = BlockFmhaShape::kN0;
     static constexpr index_t kK0 = BlockFmhaShape::kK0;
     static constexpr index_t kN1 = BlockFmhaShape::kN1;
     static constexpr index_t kK1 = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
     static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
     // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x)

@@ -87,7 +88,7 @@ struct BlockFmhaPipelineQRKSVSAsync
             return 1;
         }
-        if constexpr(kK0BlockLength <= 32)
+        if constexpr(kQKHeaddim <= 32)
         {
             if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS &&
                          FmhaMask::IsMasking)

@@ -95,21 +96,21 @@ struct BlockFmhaPipelineQRKSVSAsync
             else
                 return 2;
         }
-        else if constexpr(kK0BlockLength <= 64)
+        else if constexpr(kQKHeaddim <= 64)
         {
             if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 return 2;
             else
                 return 3;
         }
-        else if constexpr(kK0BlockLength <= 128)
+        else if constexpr(kQKHeaddim <= 128)
         {
             if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                 return 1;
             else
                 return 2;
         }
-        else if constexpr(kK0BlockLength <= 256)
+        else if constexpr(kQKHeaddim <= 256)
         {
             return 1;
         }

@@ -339,7 +340,7 @@ struct BlockFmhaPipelineQRKSVSAsync
         // auto q_tile = q; // tile_elementwise_in(q_element_func, q);
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
         static_assert(1 <= k0_loops);
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp

Same rename; this deprecated pipeline does not gain a kSubQKHeaddim member:

@@ -36,12 +36,12 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;

@@ -75,22 +75,22 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
-        if constexpr(kK0BlockLength <= 32)
+        if constexpr(kQKHeaddim <= 32)
-        else if constexpr(kK0BlockLength <= 64)
+        else if constexpr(kQKHeaddim <= 64)
-        else if constexpr(kK0BlockLength <= 128)
+        else if constexpr(kQKHeaddim <= 128)
-        else if constexpr(kK0BlockLength <= 256)
+        else if constexpr(kQKHeaddim <= 256)

@@ -232,7 +232,7 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
         static_assert(2 <= k0_loops);
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp

Same rename plus the new kSubQKHeaddim member:

@@ -36,12 +36,13 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

@@ -56,22 +57,22 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
-        if constexpr(kK0BlockLength <= 32)
+        if constexpr(kQKHeaddim <= 32)
-        else if constexpr(kK0BlockLength <= 64)
+        else if constexpr(kQKHeaddim <= 64)
-        else if constexpr(kK0BlockLength <= 128)
+        else if constexpr(kQKHeaddim <= 128)
-        else if constexpr(kK0BlockLength <= 256)
+        else if constexpr(kQKHeaddim <= 256)

@@ -235,7 +236,7 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
         // prefetch K tile
         index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;
         static_assert(2 <= k0_loops);
include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp

@@ -55,7 +55,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
     constexpr index_t MWarp      = config.template at<1>();
     constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-    constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength;
+    constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;
     constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
     constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane;

@@ -323,6 +323,9 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     template <> struct LdsBufferSequence<3, 3, 3, 3> { using type = sequence<1, 2, 0, 1, 2, 0>; };
+    template <> struct LdsBufferSequence<3, 3, 3, 4> { using type = sequence<1, 2, 0, 0, 1, 2, 0>; };
     template <> struct LdsBufferSequence<3, 3, 2, 2> { using type = sequence<1, 2, 1, 0>;};
     // clang-format on

@@ -332,12 +335,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
     {
         using BlockFmhaShape = remove_cvref_t<typename Problem::BlockFmhaShape>;

         constexpr index_t kN0 = BlockFmhaShape::kN0;
         constexpr index_t kK0 = BlockFmhaShape::kK0;
         constexpr index_t kK1 = BlockFmhaShape::kK1;
-        constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
-        constexpr index_t k0_loops       = kK0BlockLength / kK0;
+        constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;
+        constexpr index_t k0_loops   = kQKHeaddim / kK0;
         constexpr index_t k1_loops = kN0 / kK1;

         return typename LdsBufferSequence<NumPrefetchK, NumPrefetchV, k0_loops, k1_loops>::type{};
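The last hunk returns LdsBufferSequence<NumPrefetchK, NumPrefetchV, k0_loops, k1_loops>::type, so each specialization appears to supply one LDS buffer index per pipeline stage: k0_loops + k1_loops entries, each naming one of the NumPrefetch buffers. The new <3, 3, 3, 4> case covers k0_loops = 3, k1_loops = 4, which head dims such as 96 can now produce. A small standalone sketch (hypothetical scaffolding; only the sequences are from the diff) checks those two structural invariants:

#include <cstddef>
#include <initializer_list>

// Hypothetical re-statement of the specializations above, to check that each
// pipeline stage (k0_loops + k1_loops of them) gets one buffer index, and that
// every index stays below NumPrefetch (3 here).
template <int... Is>
struct seq
{
    static constexpr std::size_t size = sizeof...(Is);
    static constexpr bool all_lt(int bound)
    {
        bool ok = true;
        for(int i : {Is...})
            ok = ok && (i < bound);
        return ok;
    }
};

using Buf_3334 = seq<1, 2, 0, 0, 1, 2, 0>; // LdsBufferSequence<3, 3, 3, 4>
static_assert(Buf_3334::size == 3 + 4, "one buffer slot per k0/k1 stage");
static_assert(Buf_3334::all_lt(3), "indices stay within the 3 prefetch buffers");

using Buf_3333 = seq<1, 2, 0, 1, 2, 0>; // LdsBufferSequence<3, 3, 3, 3>
static_assert(Buf_3333::size == 3 + 3 && Buf_3333::all_lt(3));

int main() {}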
include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp

@@ -7,6 +7,20 @@
 namespace ck_tile {

+static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index_t len)
+{
+    if(len == 96)
+        return 128;
+    if(len == 160)
+        return 256;
+    // only length of 96, 160 and power-of-two is supported
+    if(!(len & (len - 1)))
+        return len;
+    return 0;
+};
+
 template <typename BlockTile_, // sequence<...
           typename Gemm0BlockWarps_,
           typename Gemm0WarpTile_,

@@ -36,10 +50,12 @@ struct TileFmhaShape
     static constexpr index_t kK0 = BlockTile::at(number<2>{}); // tile size along qk gemm unroll
     static constexpr index_t kN1 = BlockTile::at(number<3>{}); // tile size along v head_dim
     static constexpr index_t kK1 = BlockTile::at(number<4>{}); // tile size along kv gemm unroll
-    static constexpr index_t kK0BlockLength =
+    static constexpr index_t kQKHeaddim =
         BlockTile::at(number<5>{}); // total length of K0, used for pipeline that need load Q at
                                     // once (or repeately load Q as a whole tile)
-    static_assert(kK0BlockLength % kK0 == 0, "kK0BlockLength should be divisible by kK0");
+    static_assert(kQKHeaddim % kK0 == 0, "kQKHeaddim should be divisible by kK0");
+    static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length(kQKHeaddim);

     // v, rowmajor : seqlen*hdim, colmajor : hdim*seqlen
     static constexpr bool IsVLayoutRowMajor = IsVLayoutRowMajor_;
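The new ceil_to_qualified_tile_length rounds a head dimension up to a tile length the pipelines support: 96 maps to 128, 160 to 256, any power of two maps to itself, and every other length yields 0 (rejected); kSubQKHeaddim is derived from kQKHeaddim through it. A standalone copy of the rule, checkable at compile time (int32_t stands in for ck_tile::index_t):

#include <cstdint>

// Standalone copy of the rounding rule from the diff, outside the library.
constexpr std::int32_t ceil_to_qualified_tile_length(std::int32_t len)
{
    if(len == 96)
        return 128;
    if(len == 160)
        return 256;
    // only length of 96, 160 and power-of-two is supported
    if(!(len & (len - 1)))
        return len;
    return 0;
}

static_assert(ceil_to_qualified_tile_length(96) == 128);
static_assert(ceil_to_qualified_tile_length(160) == 256);
static_assert(ceil_to_qualified_tile_length(64) == 64); // power of two: unchanged
static_assert(ceil_to_qualified_tile_length(48) == 0);  // unsupported length

int main() {}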
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp

@@ -41,9 +41,9 @@ struct Layernorm2dFwdPipelineOnePass
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr"; // block per row
+            return "bpr_op"; // block per row
         else
-            return "wpr"; // warp per row
+            return "wpr_op"; // warp per row
     }();

     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp

@@ -40,9 +40,9 @@ struct Layernorm2dFwdPipelineTwoPass
     static constexpr const char* name = []() {
         if constexpr(kNeedCrossWarpSync)
-            return "bpr"; // block per row
+            return "bpr_tp"; // block per row
         else
-            return "wpr"; // warp per row
+            return "wpr_tp"; // warp per row
     }();

     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()

@@ -151,8 +151,6 @@ struct Layernorm2dFwdPipelineTwoPass
         ck_tile::index_t stride_to_right_most_window =
             row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;

-        // x_window.foo();
-        // gamma_window.foo();
         move_tile_window(x_window, {0, -Block_N});
         move_tile_window(sx_window, {0, -Block_N});
         move_tile_window(gamma_window, {stride_to_right_most_window});
...
include/ck_tile/ops/reduce.hpp
View file @
7fb9b2b6
...
@@ -4,4 +4,7 @@
...
@@ -4,4 +4,7 @@
#pragma once
#pragma once
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/reduce/block/block_reduce.hpp

@@ -6,6 +6,7 @@
 #include "ck_tile/core.hpp"
 #include <tuple>

+// This file is not support cross warp reduce
 namespace ck_tile {

 /*

@@ -15,8 +16,8 @@ namespace ck_tile {
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
 template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
                                            const ReduceFunc& reduce_func,
                                            bool_constant<WithBroadcast> = {})
 {
     using Dstr       = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode = typename Dstr::DstrEncode;

@@ -115,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
 */
 template <typename AccDistributedTensor_, typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
                                                const ReduceFunc& reduce_func)
 {
     using Dstr       = typename AccDistributedTensor_::StaticTileDistribution;
     using DstrEncode = typename Dstr::DstrEncode;

@@ -174,9 +175,9 @@ template <typename AccDistributedTensor_,
           index_t... InReduceDims,
           typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
                                       const InDistributedTensor_& in_tensor,
                                       sequence<InReduceDims...>,
                                       const ReduceFunc& reduce_func)
 {
     constexpr auto I0 = number<0>{};
     constexpr auto I1 = number<1>{};

@@ -249,9 +250,9 @@ template <typename AccDataType_,
           typename ReduceFunc,
           typename InDataType_>
 CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
                                       sequence<InReduceDims...> in_reduce_dims,
                                       const ReduceFunc& reduce_func,
                                       const InDataType_& reduce_init)
 {
     using InDataType  = typename InDistributedTensor_::DataType;
     using AccDataType = remove_cvref_t<AccDataType_>;
include/ck_tile/ops/reduce/block/block_reduce2d.hpp (new file, mode 100644, +260)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

template <typename Problem_, typename Policy_ = void>
struct BlockReduce2d
{
    // in-thread reduction
    using Problem         = remove_cvref_t<Problem_>;
    using XDataType       = typename Problem::XDataType;
    using ComputeDataType = typename Problem::ComputeDataType;

    CK_TILE_DEVICE constexpr BlockReduce2d() {}

    template <typename XDistributedTensor_, typename YDistributedTensor_, typename ReduceFunc>
    CK_TILE_DEVICE void operator()(const XDistributedTensor_& x_tensor,
                                   YDistributedTensor_& y_tensor,
                                   const ReduceFunc& reduce_func)
    {
        constexpr auto I0 = number<0>{};
        constexpr auto I1 = number<1>{};

        constexpr auto spans = XDistributedTensor_::get_distributed_spans();

        // FIXME: hard coded to reduce 2nd axis
        sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
            constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0);
            auto y = y_tensor[y_dstr_idx];
            sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
                constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
                const auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);

                y = reduce_func(y, x);
            });
            y_tensor(y_dstr_idx) = y;
        });
    }

    template <typename XDistributedTensor_>
    CK_TILE_DEVICE static auto MakeYBlockTile()
    {
        static_assert(std::is_same_v<XDataType, typename XDistributedTensor_::DataType>, "wrong!");

        // FIXME: hard coded to reduce 2nd axis
        constexpr auto reduce_dims = sequence<1>{};
        constexpr auto dstr =
            make_static_tile_distribution(detail::make_reduce_tile_distribution_encoding(
                XDistributedTensor_::get_tile_distribution()
                    .get_static_tile_distribution_encoding(),
                reduce_dims));
        auto tensor = make_static_distributed_tensor<ComputeDataType>(dstr);
        return tensor;
    }

    template <typename XDistributedTensor_, typename ReduceFunc>
    CK_TILE_DEVICE auto operator()(const XDistributedTensor_& x_tensor,
                                   const ComputeDataType& reduce_init,
                                   const ReduceFunc& reduce_func)
    {
        auto y_tensor = MakeYBlockTile<XDistributedTensor_>();
        set_tile(y_tensor, reduce_init);
        (*this)(x_tensor, y_tensor, reduce_func);
        return y_tensor;
    }
};

template <typename Problem_, typename Policy_ = void>
struct BlockReduce2dSync
{
    using Problem = remove_cvref_t<Problem_>;

    template <typename YDistributedTensor_, typename ReduceFunc>
    CK_TILE_DEVICE void operator()(YDistributedTensor_& y_tensor, const ReduceFunc& reduce_func)
    {
        using Dstr             = typename YDistributedTensor_::StaticTileDistribution;
        using DstrEncode       = typename Dstr::DstrEncode;
        using DstrEncodeDetail = typename DstrEncode::detail;

        constexpr index_t NDimP = Dstr::get_num_of_dimension_p();
        constexpr index_t NDimR = Dstr::get_num_of_dimension_r();

        constexpr index_t idim_p_lane = NDimP - 1;

        // const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
        // const auto rs_idx =
        //     y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);

        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();

        // loop over thread data
        static_for<0, thread_buf_size, 1>{}([&](auto i) {
            auto v_local = y_tensor.get_thread_buffer()[i];

            // cross-lane reduce for replication
            // only reduce on R dimension correspond to lane
            // (lane id maps to this R dimension)
            static_for<0, NDimR, 1>{}([&](auto idim_r) {
                // FIXME: nasty to use does_p_own_r_
                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_lane][idim_r])
                {
                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];

                    constexpr index_t lid_over_rid_derivative =
                        DstrEncodeDetail::ps_over_rs_derivative_[idim_p_lane][idim_r];

                    static_assert(is_power_of_two_integer(r_length),
                                  "wrong! only support power of 2 reduction");

                    constexpr index_t nstage = integer_log2_floor(r_length);

                    // reduction sweep forward
                    static_for<0, nstage, 1>{}([&](auto istage) {
                        // xor
                        index_t src_lane =
                            (__lane_id()) ^
                            (number<lid_over_rid_derivative << istage.value>{}.value);

                        // pull data from remote lane
                        const auto v_remote = warp_shuffle(v_local, src_lane);

                        // reduce
                        v_local = reduce_func(v_local, v_remote);
                    });
                }
            });

            // TODO - Do we need to broadcast to other lane?
            y_tensor.get_thread_buffer()(i) = v_local;
        });
    }
};

template <typename Problem_, typename Policy_ = void>
struct BlockReduce2dCrossWarpSync
{
    using Problem    = remove_cvref_t<Problem_>;
    using BlockShape = typename Problem::BlockShape;

    template <typename YDistributedTensor_>
    CK_TILE_DEVICE static constexpr index_t GetReduceWarps()
    {
        constexpr index_t num_reduce_warps = [&]() {
            using Dstr             = typename YDistributedTensor_::StaticTileDistribution;
            using DstrEncode       = typename Dstr::DstrEncode;
            using DstrEncodeDetail = typename DstrEncode::detail;

            constexpr index_t NDimR = Dstr::get_num_of_dimension_r();

            constexpr index_t idim_p_warp = 0;

            index_t len_ = 1;
            static_for<0, NDimR, 1>{}([&](auto idim_r) {
                if constexpr(DstrEncodeDetail::does_p_own_r_[idim_p_warp][idim_r])
                {
                    constexpr index_t r_length = DstrEncode::rs_lengths_[idim_r];
                    len_ *= r_length;
                }
            });
            return len_;
        }();
        return num_reduce_warps;
    }

    // return in byte
    template <typename YDistributedTensor_>
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
    {
        using DataType = typename YDistributedTensor_::DataType;
        // constexpr auto num_reduce_warps = GetReduceWarps<YDistributedTensor_>();
        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();

        // we need to store all data from every wave into smem
        // e.g. 2x2 reduce along N
        //      -------------> reduce N
        //      | w0 | w1 |  ___>  | w01 |
        //      | w2 | w3 |        | w23 |
        //
        //  -> store data from every wave into LDS
        //
        //      -------------> reduce N
        //      | w0 | w1 | w2 | w3 |  ----->  | w0123 |
        //
        //  -> also store data from every wave into LDS
        constexpr index_t num_warps = BlockShape::BlockSize / warpSize;
        return num_warps * thread_buf_size * sizeof(DataType);
    }

    template <typename YDistributedTensor_, typename ReduceFunc>
    CK_TILE_DEVICE void
    operator()(YDistributedTensor_& y_tensor, void* smem, const ReduceFunc& reduce_func)
    {
        using DataType = typename YDistributedTensor_::DataType;

        constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();

        DataType* smem_ptr    = reinterpret_cast<DataType*>(smem);
        const index_t lane_id = get_lane_id();
        const index_t warp_id = get_warp_id();

        constexpr auto num_reduce_warps = GetReduceWarps<YDistributedTensor_>();
        constexpr index_t num_warps     = BlockShape::BlockSize / warpSize;

        const index_t smem_offset = warp_id;

        // skip if nonthing to do
        if constexpr(num_reduce_warps == 1)
            return;

        // store into smem only for lane-0 within one warp
        if(lane_id == 0)
        {
            static_for<0, thread_buf_size, 1>{}([&](auto i) {
                smem_ptr[smem_offset + i * num_warps] = y_tensor.get_thread_buffer()[i];
            });
        }
        block_sync_lds();

        // load from smem. here we let everythread to do compute :)
        index_t local_warp_id = warp_id / num_reduce_warps;
        index_t local_smem_os = local_warp_id * num_reduce_warps;
        DataType all_scratch[thread_buf_size * num_reduce_warps];
        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
            static_for<0, num_reduce_warps, 1>{}([&](auto i_1) {
                all_scratch[i_0 * num_reduce_warps + i_1] =
                    smem_ptr[i_0 * num_warps + local_smem_os + i_1];
            });
        });
        block_sync_lds(); // TODO: we don't need sync here

        static_for<0, thread_buf_size, 1>{}([&](auto i_0) {
            // TODO: use descriptor for this
            auto v_local = all_scratch[i_0 * num_reduce_warps];

            // further reduce mean/var
            static_for<0, num_reduce_warps - 1, 1>{}([&](auto i_1_n1) {
                constexpr auto i_1 = number<i_1_n1 + 1>{};
                const DataType v_remote = all_scratch[i_0 * num_reduce_warps + i_1];

                // reduce
                v_local = reduce_func(v_local, v_remote);
            });

            y_tensor.get_thread_buffer()(i_0) = v_local;
        });
    }
};
} // namespace ck_tile
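BlockReduce2dSync folds replicated lanes with an xor-shuffle butterfly: for each R dimension owned by the lane id it runs integer_log2_floor(r_length) stages, and at stage s each lane exchanges with the lane whose id differs by lid_over_rid_derivative << s, which is why the static_assert requires r_length to be a power of two. A host-side simulation of the butterfly (plain C++, no HIP; the 8-lane warp and add-reduction are hypothetical) shows every lane ending with the full reduction:

#include <cassert>
#include <vector>

// Host-side simulation of the xor butterfly in BlockReduce2dSync: src_lane is
// lane ^ (1 << stage), mirroring warp_shuffle(v_local, lane ^ (deriv << stage))
// with lid_over_rid_derivative == 1.
int main()
{
    const int r_length = 8; // must be a power of two, per the static_assert
    std::vector<int> v(r_length);
    for(int lane = 0; lane < r_length; ++lane)
        v[lane] = lane + 1; // per-lane partial reductions: 1..8

    for(int stage = 0; (1 << stage) < r_length; ++stage)
    {
        std::vector<int> next(v);
        for(int lane = 0; lane < r_length; ++lane)
        {
            int src_lane = lane ^ (1 << stage);   // remote partner at this stage
            next[lane]   = v[lane] + v[src_lane]; // reduce_func == add
        }
        v = next;
    }

    for(int lane = 0; lane < r_length; ++lane)
        assert(v[lane] == 36); // every lane holds the total (1 + ... + 8)
    return 0;
}

After log2(r_length) stages every lane already holds the reduced value, so no separate broadcast is needed (hence the "Do we need to broadcast" TODO above).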
include/ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp (new file, mode 100644, +79)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"

namespace ck_tile {

struct BlockReduce2dDefaultPolicy
{
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
    {
        using S = typename Problem::BlockShape;

        return make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,
                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
                tuple<sequence<1, 2>, sequence<1, 2>>,
                tuple<sequence<1, 1>, sequence<2, 2>>,
                sequence<1, 1, 2, 2>,
                sequence<0, 3, 0, 3>>{});
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2d<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2dSync<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2dCrossWarpSync<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
    {
        if constexpr(Problem::kNeedCrossWarpSync)
        {
            using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                            typename Problem::ComputeDataType,
                                            typename Problem::BlockShape>;
            using block_reduce2d = BlockReduce2d<P_>;
            using x_block_tile =
                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
                    MakeXBlockTileDistribution<Problem>()));
            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());

            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
        }
        else
        {
            return 1; // zero size arrays are an extension
        }
    }
};
} // namespace ck_tile
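For the cross-warp path, the GetSmemSize in block_reduce2d.hpp sizes the LDS scratch as num_warps * thread_buf_size * sizeof(DataType), since every warp parks its per-thread partials in shared memory before the final fold. Plugging in hypothetical numbers:

#include <cstdio>

// Hypothetical numbers applied to BlockReduce2dCrossWarpSync::GetSmemSize().
int main()
{
    const int block_size      = 256; // hypothetical BlockShape::BlockSize
    const int warp_size       = 64;  // AMD wavefront
    const int num_warps       = block_size / warp_size; // 4
    const int thread_buf_size = 2;   // hypothetical Y-tile values per thread
    const int bytes = num_warps * thread_buf_size * static_cast<int>(sizeof(float));
    std::printf("LDS bytes for cross-warp reduce: %d\n", bytes); // 32
}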
include/ck_tile/ops/reduce/block/block_reduce2d_problem.hpp (new file, mode 100644, +18)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

template <typename XDataType_, typename ComputeDataType_, typename BlockShape_>
struct BlockReduce2dProblem
{
    using XDataType       = remove_cvref_t<XDataType_>;
    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
    using BlockShape      = remove_cvref_t<BlockShape_>;
};
} // namespace ck_tile
include/ck_tile/ops/rmsnorm2d.hpp (new file, mode 100644, +12)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp"
#include "ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_kernel.hpp (new file, mode 100644, +202)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"

namespace ck_tile {

// host side args
struct Rmsnorm2dFwdHostArgs
{
    const void* p_x;
    const void* p_gamma;
    void* p_y;
    void* p_invRms;

    float epsilon;

    index_t m;
    index_t n;
    index_t stride; // row_stride
};

// TODO: Extract some type to wrapper class
template <typename Pipeline_>
struct Rmsnorm2dFwd
{
    using Pipeline = remove_cvref_t<Pipeline_>;
    using Problem  = typename Pipeline::Problem;

    using XDataType       = remove_cvref_t<typename Problem::XDataType>;
    using GammaDataType   = remove_cvref_t<typename Problem::GammaDataType>;
    using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
    using YDataType       = remove_cvref_t<typename Problem::YDataType>;
    using InvRmsDataType  = remove_cvref_t<typename Problem::InvRmsDataType>;

    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, null_type>;
    static constexpr bool kSaveInvRms = Problem::kSaveInvRms;

    static constexpr index_t Block_M = Problem::BlockShape::Block_M;
    static constexpr index_t Block_N = Problem::BlockShape::Block_N;
    static constexpr bool kPadM      = false; // always no need to pad along M
    static constexpr bool kPadN      = Problem::kPadN;
    static constexpr bool kTwoPass   = Problem::kTwoPass;

    static constexpr index_t ThreadPerWarp_N = Problem::BlockShape::ThreadPerWarp_N;
    static constexpr index_t Vector_N        = Problem::BlockShape::Vector_N;
    static constexpr index_t Repeat_N        = Problem::BlockShape::Repeat_N;

    static constexpr auto I0 = number<0>{};
    static constexpr auto I1 = number<1>{};

    struct Kargs
    {
        const void* p_x;
        const void* p_gamma;
        void* p_y;
        void* p_invRms;

        float epsilon;

        index_t m;
        index_t n;
        index_t stride; // row_stride
    };
    using Hargs = Rmsnorm2dFwdHostArgs;

    CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
    {
        return Kargs{hargs.p_x,
                     hargs.p_gamma,
                     hargs.p_y,
                     hargs.p_invRms,
                     hargs.epsilon,
                     hargs.m,
                     hargs.n,
                     hargs.stride};
    }

    CK_TILE_HOST static constexpr auto GridSize(const Hargs& hargs)
    {
        return (hargs.m + Block_M - 1) / Block_M;
    }

    CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::BlockSize; }

    // clang-format off
    template <typename T> struct t2s;
    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
    // clang-format on

    // in byte
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Pipeline::GetSmemSize(); }

    CK_TILE_HOST static std::string GetName()
    {
        // clang-format off
        using S_ = typename Problem::BlockShape;
        auto surfix = [&] () {
            std::string n;
            if (kPadN) n += "_pn";
            if (kSaveInvRms) n += "_rms";
            if (kTwoPass) n += "_2p";
            return n; }();

        #define _SS_ std::string
        #define _TS_ std::to_string
        return _SS_("rmsnorm2d_fwd_") + _SS_(t2s<XDataType>::name) + "_" +
               _TS_(S_::Block_M) + "x" + _TS_(S_::Block_N) + "_" +
               _TS_(S_::WarpPerBlock_M) + "x" + _TS_(S_::WarpPerBlock_N) + "_" +
               _TS_(S_::Warp_M) + "x" + _TS_(S_::Warp_N) + "_" +
               _TS_(S_::Vector_M) + "x" + _TS_(S_::Vector_N) + "_" +
               _SS_(Pipeline::name) + surfix;
        #undef _SS_
        #undef _TS_
        // clang-format on
    }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        const auto iM = get_block_id() * Block_M;

        const auto x_window = [&]() {
            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                static_cast<const XDataType*>(kargs.p_x),
                make_tuple(kargs.m, kargs.n),
                make_tuple(kargs.stride, 1),
                number<Vector_N>{},
                number<1>{});

            const auto tmp2_ = pad_tensor_view(
                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
            return make_tile_window(
                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
        }();

        const auto gamma_window = [&]() {
            const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                static_cast<const GammaDataType*>(kargs.p_gamma),
                make_tuple(kargs.n),
                make_tuple(1),
                number<Vector_N>{},
                number<1>{});

            const auto tmp2_ =
                pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<kPadM>{});

            return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
        }();

        auto y_window = [&]() {
            auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                static_cast<YDataType*>(kargs.p_y),
                make_tuple(kargs.m, kargs.n),
                make_tuple(kargs.stride, 1),
                number<Vector_N>{},
                number<1>{});

            auto tmp2_ = pad_tensor_view(
                tmp_, make_tuple(number<Block_M>{}, number<Block_N>{}), sequence<kPadM, kPadN>{});
            return make_tile_window(
                tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
        }();

        auto inv_rms_window = [&]() {
            if constexpr(kSaveInvRms)
            {
                const auto inv_rms_m = [&]() {
                    const auto inv_rms_dram_naive =
                        make_naive_tensor_view_packed<address_space_enum::global>(
                            static_cast<InvRmsDataType*>(kargs.p_invRms),
                            make_tuple(kargs.m),
                            number<1>{});

                    return pad_tensor_view(
                        inv_rms_dram_naive, make_tuple(number<Block_M>{}), sequence<kPadM>{});
                }();

                return make_tile_window(inv_rms_m, make_tuple(number<Block_M>{}), {iM});
            }
            else
                return make_null_tile_window(make_tuple(number<Block_M>{}));
        }();

        __shared__ char smem[GetSmemSize()];

        Pipeline{}(x_window,
                   gamma_window,
                   y_window,
                   inv_rms_window,
                   static_cast<const ComputeDataType>(kargs.epsilon),
                   kargs.n,
                   smem);
    }
};
} // namespace ck_tile
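The kernel is launched on a 1-D grid over rows: each block starts at get_block_id() * Block_M, so GridSize is the ceiling division of m by Block_M. A trivial sketch with hypothetical sizes:

#include <cstdio>

// Hypothetical numbers; mirrors Rmsnorm2dFwd::GridSize.
constexpr int grid_size(int m, int block_m) { return (m + block_m - 1) / block_m; }

int main()
{
    constexpr int Block_M = 2; // per-block row tile (illustrative)
    std::printf("m=10 -> %d blocks\n", grid_size(10, Block_M)); // 5
    std::printf("m=11 -> %d blocks\n", grid_size(11, Block_M)); // 6: last block covers the tail row
}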
include/ck_tile/ops/rmsnorm2d/kernel/rmsnorm2d_fwd_shape.hpp (new file, mode 100644, +78)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"

namespace ck_tile {

/*
// clang-format off
4-level descriptor: BlockTile-> WarpPerBlock-> WarpTile-> Vector

                  Block_N (Warp_N * WarpPerBlock_N * Repeat_N)
        +<----------------------< Repeat_N(2)>--------------------->+
        |                                                           |
        +<-- <WarpPerBlock_N(2)> -->+
                  Warp_N
        +--------------+--------------+--------------+--------------+----+
 Warp_M |    wrap_0    |    wrap_1    |              |              |  ^        ^
        +--------------+--------------+              |              | <WarpPerBlock_M(2)>
        |    wrap_2    |    wrap_3    |              |              |  v        |
        +--------------+--------------+--------------+--------------+----+   Block_M
        |              |              |              |              |          |
        +              +              |              |              |          |
        |              |              |              |              |          v
        +--------------+--------------+--------------+--------------+          +

  each Warp-tile (e.g 16 thrd per row)
      Vector_N (contiguous pixels each thrd holds along N, or vector size)
        +-----------+-----------+-----------+-----------+-----------+
        |  thrd_0   |  thrd_1   |  thrd_2   |  thrd_3   | ...         Vector_M
        +-----------+-----------+-----------+-----------+-----------+
        |  thrd_16  |  thrd_17  |  thrd_18  |  thrd_19  | ...
        +-----------+-----------+-----------+-----------+-----------+
// clang-format on
*/
template <typename BlockTile_,    // block size, seq<M, N>
          typename WarpPerBlock_, // num warps along seq<M, N>
          typename WarpTile_,     // warp size, seq<M, N>
          typename Vector_,       // contiguous pixels(vector size) along seq<M, N>
          index_t BlockSize_ =
              warpSize * reduce_on_sequence(WarpPerBlock_{}, multiplies{}, number<1>{})>
struct Rmsnorm2dShape
{
    // block size
    static constexpr index_t Block_M = BlockTile_::at(number<0>{});
    static constexpr index_t Block_N = BlockTile_::at(number<1>{});

    // num warps along seq<M, N>, within each block
    static constexpr index_t WarpPerBlock_M = WarpPerBlock_::at(number<0>{});
    static constexpr index_t WarpPerBlock_N = WarpPerBlock_::at(number<1>{});

    // warp size
    static constexpr index_t Warp_M = WarpTile_::at(number<0>{});
    static constexpr index_t Warp_N = WarpTile_::at(number<1>{});

    static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
    static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);

    // repeat of each thread along seq<M, N>
    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);

    // vector size along seq<M, N>
    static constexpr index_t Vector_M = Vector_::at(number<0>{});
    static constexpr index_t Vector_N = Vector_::at(number<1>{});

    static_assert(Warp_M % Vector_M == 0);
    static_assert(Warp_N % Vector_N == 0);

    // num of threads along seq<M, N>, within each warp
    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;

    static constexpr index_t BlockSize = BlockSize_;
};
} // namespace ck_tile
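A worked instantiation helps sanity-check the derived quantities (the numbers below are hypothetical, not from the commit): with BlockTile = seq<2, 512>, WarpPerBlock = seq<1, 4>, WarpTile = seq<1, 64>, Vector = seq<1, 1> and a 64-lane AMD wavefront, the formulas above give Repeat = <2, 2>, ThreadPerWarp = <1, 64> and BlockSize = 256. Since WarpPerBlock_N > 1, the Rmsnorm2dFwdPipelineProblem at the end of this page would flag kNeedCrossWarpSync for this shape.

#include <cstdint>

// Hypothetical Rmsnorm2dShape instantiation, worked through by hand.
constexpr int warp_size = 64; // AMD wavefront

constexpr int Block_M = 2, Block_N = 512;
constexpr int WarpPerBlock_M = 1, WarpPerBlock_N = 4;
constexpr int Warp_M = 1, Warp_N = 64;
constexpr int Vector_M = 1, Vector_N = 1;

// Derived exactly as in Rmsnorm2dShape:
constexpr int Repeat_M        = Block_M / (WarpPerBlock_M * Warp_M); // 2
constexpr int Repeat_N        = Block_N / (WarpPerBlock_N * Warp_N); // 2
constexpr int ThreadPerWarp_M = Warp_M / Vector_M;                   // 1
constexpr int ThreadPerWarp_N = Warp_N / Vector_N;                   // 64
constexpr int BlockSize       = warp_size * WarpPerBlock_M * WarpPerBlock_N; // 256

static_assert(Block_M % (WarpPerBlock_M * Warp_M) == 0);
static_assert(Block_N % (WarpPerBlock_N * Warp_N) == 0);
static_assert(Repeat_N == 2 && ThreadPerWarp_N == 64 && BlockSize == 256);

int main() {}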
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp (new file, mode 100644, +94)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"

namespace ck_tile {

struct Rmsnorm2dFwdPipelineDefaultPolicy
{
    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
    {
        using S = typename Problem::BlockShape;

        return make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,
                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
                tuple<sequence<1, 2>, sequence<1, 2>>,
                tuple<sequence<1, 1>, sequence<2, 2>>,
                sequence<1, 1, 2, 2>,
                sequence<0, 3, 0, 3>>{});
    }

    template <typename Problem>
    CK_TILE_DEVICE static constexpr auto MakeGammaBlockTileDistribution()
    {
        using S = typename Problem::BlockShape;

        return make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<S::WarpPerBlock_M, S::ThreadPerWarp_M>,
                tuple<sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
                tuple<sequence<0, 1>, sequence<0, 1>>,
                tuple<sequence<0, 1>, sequence<1, 2>>,
                sequence<1, 1>,
                sequence<0, 3>>{});
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2d<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2dSync<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
    {
        using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                        typename Problem::ComputeDataType,
                                        typename Problem::BlockShape>;

        return BlockReduce2dCrossWarpSync<P_>{};
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
    {
        if constexpr(Problem::kNeedCrossWarpSync)
        {
            using P_ = BlockReduce2dProblem<typename Problem::XDataType,
                                            typename Problem::ComputeDataType,
                                            typename Problem::BlockShape>;
            using block_reduce2d = BlockReduce2d<P_>;
            using x_block_tile =
                decltype(make_static_distributed_tensor<typename Problem::XDataType>(
                    MakeXBlockTileDistribution<Problem>()));
            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());

            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
        }
        else
        {
            return 1; // zero size arrays are an extension
        }
    }
};
} // namespace ck_tile
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_one_pass.hpp (new file, mode 100644, +101)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_default_policy.hpp"
#include <string>
#include <type_traits>

namespace ck_tile {

template <typename Problem_, typename Policy_ = Rmsnorm2dFwdPipelineDefaultPolicy>
struct Rmsnorm2dFwdPipelineOnePass
{
    using Problem = ck_tile::remove_cvref_t<Problem_>;
    using Policy  = ck_tile::remove_cvref_t<Policy_>;

    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
    using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
    using InvRmsDataType  = ck_tile::remove_cvref_t<typename Problem::InvRmsDataType>;

    static constexpr bool kHasGamma   = !std::is_same_v<GammaDataType, ck_tile::null_type>;
    static constexpr bool kSaveInvRms = Problem::kSaveInvRms;

    static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
    static constexpr bool kPadM              = false; // TODO - BlockRmsnorm2dFwdProblem::kPadM
    static constexpr bool kPadN              = Problem::kPadN;

    static constexpr const char* name = []() {
        if constexpr(kNeedCrossWarpSync)
            return "bpr_op"; // block per row
        else
            return "wpr_op"; // warp per row
    }();

    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
    {
        return Policy::template GetSmemSize<Problem>();
    }

    template <typename XWindow, typename GammaWindow, typename YWindow, typename InvRmsWindow>
    CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
                                   const GammaWindow& gamma_window_,
                                   YWindow& y_window,
                                   InvRmsWindow& inv_rms_window,
                                   ComputeDataType epsilon,
                                   ck_tile::index_t row_size,
                                   void* smem) const
    {
        const auto x_window =
            make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
        const auto gamma_window = make_tile_window(
            gamma_window_, Policy::template MakeGammaBlockTileDistribution<Problem>());

        auto reduce_square_sum_func = ReduceOp::SquareAdd{};
        auto reduce_sum_func        = ReduceOp::Add{};
        auto block_reduce2d         = Policy::template GetBlockReduce2d<Problem>();
        auto block_reduce2d_sync    = Policy::template GetBlockReduce2dSync<Problem>();
        auto block_reduce2d_cross_warp_sync =
            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();

        const auto x = load_tile(x_window);
        // load gamma (TODO: support no gamma?)
        const auto gamma = load_tile(gamma_window);

        // compute mean square each-thread->cross-lane->cross-warp
        auto square_sum = block_reduce2d(
            x, reduce_square_sum_func.GetIdentityValue<ComputeDataType>(), reduce_square_sum_func);
        block_reduce2d_sync(square_sum, reduce_sum_func);
        block_reduce2d_cross_warp_sync(square_sum, smem, reduce_sum_func);

        // compute inv-rms
        auto inv_rms = tile_elementwise_in(
            [&](const auto& v_) {
                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ / row_size + epsilon));
            },
            square_sum);

        if constexpr(kSaveInvRms)
            store_tile(inv_rms_window, cast_tile<InvRmsDataType>(inv_rms));

        // rmsnorm computation
        auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
        sweep_tile(y, [&, inv_rms_ = inv_rms](auto idx) {
            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
            constexpr auto j_idx = make_tuple(idx[number<1>{}]);

            const auto gamma_ = type_convert<ComputeDataType>(gamma[j_idx]);

            const auto x_ = type_convert<ComputeDataType>(x[idx]);
            auto y_       = x_ * inv_rms_[i_idx] * gamma_;

            y(idx) = type_convert<YDataType>(y_);
        });

        store_tile(y_window, y);
    }
};
} // namespace ck_tile
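Per row of length n, the one-pass pipeline computes square_sum = sum_j x_j^2, inv_rms = 1 / sqrt(square_sum / n + epsilon), and y_j = x_j * inv_rms * gamma_j. A self-contained host reference for one row (hypothetical sizes and values) that a kernel output could be checked against:

#include <cmath>
#include <cstdio>
#include <vector>

// Host reference for one row of rmsnorm2d_fwd, following the pipeline above:
// square-sum reduce -> inv_rms = 1/sqrt(mean_square + eps) -> scale by gamma.
void rmsnorm_row(const float* x, const float* gamma, float* y, int n, float eps)
{
    float square_sum = 0.f;
    for(int j = 0; j < n; ++j)
        square_sum += x[j] * x[j]; // ReduceOp::SquareAdd over the row

    const float inv_rms = 1.f / std::sqrt(square_sum / n + eps);

    for(int j = 0; j < n; ++j)
        y[j] = x[j] * inv_rms * gamma[j];
}

int main()
{
    std::vector<float> x{1.f, 2.f, 3.f, 4.f}, gamma(4, 1.f), y(4);
    rmsnorm_row(x.data(), gamma.data(), y.data(), 4, 1e-6f);
    for(float v : y)
        std::printf("%f\n", v);
}

With gamma = 1 this is just x divided by its RMS, here sqrt(30/4), roughly 2.7386.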
include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_problem.hpp (new file, mode 100644, +36)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core/utility/type_traits.hpp"

namespace ck_tile {

template <typename XDataType_,
          typename GammaDataType_,
          typename ComputeDataType_,
          typename YDataType_,
          typename InvRmsDataType_,
          typename BlockShape_,
          bool kPadN_,
          bool kSaveInvRms_,
          bool kTwoPass_>
struct Rmsnorm2dFwdPipelineProblem
{
    using XDataType       = remove_cvref_t<XDataType_>;
    using GammaDataType   = remove_cvref_t<GammaDataType_>;
    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
    using YDataType       = remove_cvref_t<YDataType_>;
    using InvRmsDataType  = remove_cvref_t<InvRmsDataType_>;
    using BlockShape      = remove_cvref_t<BlockShape_>;

    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;

    static constexpr bool kPadN       = kPadN_;
    static constexpr bool kSaveInvRms = kSaveInvRms_;
    static constexpr bool kTwoPass    = kTwoPass_;
};
} // namespace ck_tile