gaoqiong / composable_kernel_ROCM · Commits

Commit 72c9f129, authored Sep 20, 2024 by Jun Liu

    Merge branch 'amd-develop' into amd-master

Parents: 241c261f, ded0d83d
Showing 20 of 235 changed files, with 1777 additions and 270 deletions (+1777 −270):
docs/sphinx/requirements.txt  (+1 −1)
example/01_gemm/gemm_xdl_fp8.cpp  (+2 −2)
example/01_gemm/run_gemm_example.inc  (+5 −5)
example/12_reduce/reduce_blockwise.cpp  (+28 −1)
example/12_reduce/reduce_example_common.hpp  (+3 −2)
example/20_grouped_conv_bwd_weight/common.hpp  (+2 −6)
example/62_convnd_activ/CMakeLists.txt  (+1 −0)
example/62_convnd_activ/convscale_reduce/CMakeLists.txt  (+14 −0)
example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp  (+502 −0)
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp  (+82 −0)
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp  (+82 −0)
example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc  (+98 −0)
example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp  (+1 −0)
example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp  (+13 −5)
example/CMakeLists.txt  (+14 −0)
example/ck_tile/01_fmha/CMakeLists.txt  (+48 −7)
example/ck_tile/01_fmha/codegen/cpp_symbol_map.py  (+29 −1)
example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py  (+370 −179)
example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py  (+355 −0)
example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py  (+127 −61)
docs/sphinx/requirements.txt

@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.6.2
+rocm-docs-core==1.7.2
     # via -r requirements.in
 six==1.16.0
     # via pybtex
example/01_gemm/gemm_xdl_fp8.cpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -7,7 +7,7 @@
 using ADataType        = ck::f8_t;
 using BDataType        = ck::f8_t;
-using CDataType        = ck::half_t;
+using CDataType        = ck::f8_t;
 using AccDataType      = float;
 using CShuffleDataType = float;
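With CDataType switched from ck::half_t to ck::f8_t, this example now writes its GEMM output in FP8 rather than FP16, which is consistent with the loosened FP8 verification tolerances in run_gemm_example.inc below, changed in the same commit.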
example/01_gemm/run_gemm_example.inc

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -34,11 +34,11 @@ inline __host__ __device__ constexpr double get_rtol()
     }
     else if constexpr(std::is_same_v<DataType, ck::f8_t>)
     {
-        return 1e-1;
+        return 2e-1; // 240 and 224 are acceptable
     }
     else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
     {
-        return 1.5e-1;
+        return 2e-1; // 57344 and 49152 are acceptable
     }
     else
     {

@@ -75,11 +75,11 @@ inline __host__ __device__ constexpr double get_atol()
     }
     else if constexpr(std::is_same_v<DataType, ck::f8_t>)
     {
-        return 16.1;
+        return 2e-1; // 240 and 224 are acceptable
     }
     else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
     {
-        return 8192.1;
+        return 2e-1; // 57344 and 49152 are acceptable
     }
     else
     {
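The looser tolerances follow from FP8's coarse value grid: the comments note that both 240 and 224 (and, for bf8, both 57344 and 49152) must be accepted, and |240 - 224| / 240 ≈ 0.067 while |57344 - 49152| / 57344 ≈ 0.143, both inside the new rtol of 2e-1. Below is a minimal sketch of how such rtol/atol pairs are typically consumed; it is not CK's actual checker (whose helper names differ), just an illustration of the comparison the tolerances feed:

// check_close.cpp -- minimal sketch of an rtol/atol elementwise comparison.
#include <cmath>
#include <cstddef>
#include <vector>

template <typename T>
bool check_close(const std::vector<T>& ref, const std::vector<T>& out, double rtol, double atol)
{
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double r = static_cast<double>(ref[i]);
        const double o = static_cast<double>(out[i]);
        // with rtol = 2e-1: |240 - 224| = 16 <= 0.2 * 240 = 48, so 224 passes against 240
        if(std::fabs(r - o) > atol + rtol * std::fabs(r))
            return false;
    }
    return true;
}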
example/12_reduce/reduce_blockwise.cpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <initializer_list>

@@ -255,34 +255,61 @@ int main(int argc, char* argv[])
     else
     {
         // for testing half_t
+        pass = pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<ck::half_t, float, ReduceOpId, PropagateNan, OutputIndex>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

         // for testing float
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

         // for testing double
+        pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<float, float, ReduceOpId, PropagateNan, OutputIndex>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

         // for testing bhalf_t
+        pass = pass && reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<ck::bhalf_t, float, ReduceOpId, PropagateNan, OutputIndex>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

         // for testing int8_t
+        pass = pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<int8_t, int32_t, ReduceOpId, PropagateNan, OutputIndex>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
         // for testing int4_t using AVG operation
+        pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<int4_t, int32_t, ReduceTensorOp::AVG, false, false>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);

         // for testing int4_t using MAX operation
+        pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
+            true, 2, true, {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, {0, 1, 2}, 1.0f, 0.0f);
         pass = pass && reduce_blockwise_test<int4_t, int8_t, ReduceTensorOp::MAX, false, false>(
             true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f);
 #endif
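The new cases exercise rank-12 tensors of shape {3, ..., 3} with dimensions {0, 1, 2} reduced. A small host-side sketch (assuming the test helper's lengths/reduce-dims semantics as they appear above) of what that shape implies:

// reduce_shape_sketch.cpp -- sizes implied by reducing dims {0,1,2} of a rank-12 tensor of 3s.
#include <algorithm>
#include <cstdint>
#include <vector>

int main()
{
    const std::vector<std::int64_t> lengths(12, 3); // the new rank-12 test shape
    const std::vector<int> reduce_dims{0, 1, 2};

    std::int64_t reduce_len = 1, invariant_len = 1;
    for(int d = 0; d < static_cast<int>(lengths.size()); ++d)
    {
        const bool reduced =
            std::find(reduce_dims.begin(), reduce_dims.end(), d) != reduce_dims.end();
        (reduced ? reduce_len : invariant_len) *= lengths[d];
    }
    // each of 3^9 = 19683 outputs folds 3^3 = 27 inputs
    return (reduce_len == 27 && invariant_len == 19683) ? 0 : 1;
}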
example/12_reduce/reduce_example_common.hpp

 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -38,7 +38,8 @@ struct ReduceShape
     static constexpr ck::index_t NumReduceDim_ = NumReduceDim;
 };

-using reduce_shape_instances = std::tuple<ReduceShape<3, 1>,
+using reduce_shape_instances = std::tuple<ReduceShape<12, 3>,
+                                          ReduceShape<3, 1>,
                                           ReduceShape<3, 2>,
                                           ReduceShape<4, 1>,
                                           ReduceShape<4, 2>,
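The new ReduceShape<12, 3> instance (rank 12, three reduced dimensions) is what backs the {3, 3, ..., 3} / {0, 1, 2} test cases added to reduce_blockwise.cpp above.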
example/20_grouped_conv_bwd_weight/common.hpp

@@ -23,12 +23,8 @@
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
 using F32  = float;
-#ifdef CK_ENABLE_FP8
 using F8 = ck::f8_t;
-#endif
-#ifdef CK_ENABLE_BF8
 using BF8 = ck::bf8_t;
-#endif

 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
example/62_convnd_activ/CMakeLists.txt

@@ -3,6 +3,7 @@ add_subdirectory(convinvscale)
 add_subdirectory(convscale)
 add_subdirectory(convscale_relu)
 add_subdirectory(convscale_add)
+add_subdirectory(convscale_reduce)
 add_subdirectory(multi_AB)
 add_subdirectory(unary)
example/62_convnd_activ/convscale_reduce/CMakeLists.txt  (new file, mode 100644)

list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_custom_target(example_convnd_activ_xdl_convscale_reduce)

        add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
        add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)

        add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
        add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)

        set(target 1)
    endif()
endforeach()
example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp  (new file, mode 100644; +502 lines, diff collapsed)
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp  (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_convscale_reduce_common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

using InDataType       = ck::f8_t;
using WeiDataType      = ck::f8_t;
using AccDataType      = float;
using CShuffleDataType = float;
using ConvOutDataType  = float;    // data type of convolution result
using OutDataType      = ck::f8_t; // data type of final result

using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<>,
        OutLayout,
        InDataType,
        WeiDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<>,
        ConvOutDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        8,           // ABlockTransferSrcScalarPerVector
        8,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        8,           // BBlockTransferSrcScalarPerVector
        8,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8,
        AComputeDataType,
        BComputeDataType>;

#include "run_convnd_fwd_example.inc"

int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
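Both new examples pair a ConvScale-style output element op with an absolute-maximum (amax) reduction over the scaled convolution result, the pattern used to calibrate FP8 scaling factors. The element op itself lives in the collapsed common header above, so the scale composition in this sketch is an assumption; it only illustrates the amax pattern, not CK's device implementation:

// convscale_amax_sketch.cpp -- host-side illustration of scale-then-amax.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

int main()
{
    // placeholder FP32 convolution results; scale names are assumptions
    std::vector<float> conv_out{1.5f, -3.25f, 0.5f};
    const float scale_in = 0.5f, scale_wei = 2.0f, scale_out = 1.0f;

    float amax = 0.f;
    std::vector<float> final_out(conv_out.size());
    for(std::size_t i = 0; i < conv_out.size(); ++i)
    {
        const float v = conv_out[i] * scale_in * scale_wei * scale_out;
        amax         = std::max(amax, std::fabs(v)); // reduced over the whole tensor
        final_out[i] = v;                            // device would convert this to ck::f8_t
    }
    return amax > 0.f ? 0 : 1;
}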
example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp  (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_convscale_reduce_common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"

using InDataType       = ck::f8_t;
using WeiDataType      = ck::f8_t;
using AccDataType      = float;
using CShuffleDataType = float;
using ConvOutDataType  = float;    // data type of convolution result
using OutDataType      = ck::f8_t; // data type of final result

using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScaleRelu;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;

template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<>,
        OutLayout,
        InDataType,
        WeiDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<>,
        ConvOutDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        8,           // ABlockTransferSrcScalarPerVector
        8,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        8,           // BBlockTransferSrcScalarPerVector
        8,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8,
        AComputeDataType,
        BComputeDataType>;

#include "run_convnd_fwd_example.inc"

int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc  (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

bool run_convnd_fwd_example(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    ck::utils::conv::ConvParam conv_param{
        2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        const ck::index_t num_dim_spatial = std::stoi(argv[4]);

        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }

    // instantiate in and wei element ops, will
    // instantiate out_element_op below for every iteration
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};

    const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) {
        constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;

        using InLayout  = decltype(in_layout);
        using WeiLayout = decltype(wei_layout);
        using OutLayout = decltype(out_layout);

        const auto in_g_n_c_wis_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
                conv_param);

        const auto wei_g_k_c_xs_desc =
            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
                conv_param);

        const auto out_g_n_k_wos_desc =
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
                conv_param);

        return run_grouped_conv_fwd<ndim_spatial_value,
                                    InDataType,
                                    WeiDataType,
                                    ConvOutDataType,
                                    OutDataType,
                                    InElementOp,
                                    WeiElementOp,
                                    OutElementOp,
                                    DeviceGroupedConvNDFwdInstance<ndim_spatial_value,
                                                                   InLayout,
                                                                   WeiLayout,
                                                                   OutLayout>>(do_verification,
                                                                               init_method,
                                                                               time_kernel,
                                                                               conv_param,
                                                                               in_g_n_c_wis_desc,
                                                                               wei_g_k_c_xs_desc,
                                                                               out_g_n_k_wos_desc,
                                                                               in_element_op,
                                                                               wei_element_op);
    };

    namespace ctc = ck::tensor_layout::convolution;

    if(conv_param.num_dim_spatial_ == 1)
    {
        return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{});
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{});
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
    }

    return true;
}
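As a usage note (command line inferred from the argument parsing above and the target names in the new CMakeLists), running example_convnd_fwd_xdl_convscale_amax_fp8 with no arguments uses the default 2-D convolution problem; passing three arguments, e.g. "1 1 0", sets verification on, integer initialization, and timing off; longer invocations additionally supply num_dim_spatial as argv[4] followed by the sizes consumed by parse_conv_param.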
example/65_gemm_multiply_multiply/gemm_add_add_xdl_fp16.cpp

@@ -208,6 +208,7 @@ int main(int argc, char* argv[])
                               StrideB,
                               std::array<ck::index_t, NumDTensor>{StrideD, StrideD},
                               StrideE,
+                              1,
                               a_element_op,
                               b_element_op,
                               cde_element_op);
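The bare 1 added here appears to fill the same argument slot that the FP8 example below fills with KBatch, i.e. this example keeps a split-K factor of one.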
example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8.cpp

@@ -69,7 +69,7 @@ using AElementOp = PassThrough;
 using BElementOp   = PassThrough;
 using CDEElementOp = MultiplyMultiply;

-static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNPadding;

 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3
 // clang-format off

@@ -99,6 +99,8 @@ int main(int argc, char* argv[])
     ck::index_t StrideD = 0;
     ck::index_t StrideE = N;

+    ck::index_t KBatch = 1;
+
     if(argc == 1)
     {
         // use default case

@@ -109,7 +111,7 @@ int main(int argc, char* argv[])
         init_method = std::stoi(argv[2]);
         time_kernel = std::stoi(argv[3]);
     }
-    else if(argc == 11)
+    else if(argc == 12)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);

@@ -123,13 +125,16 @@ int main(int argc, char* argv[])
         StrideB = std::stoi(argv[8]);
         StrideD = std::stoi(argv[9]);
         StrideE = std::stoi(argv[10]);
+
+        KBatch = std::stoi(argv[11]);
     }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
-        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
         exit(0);
     }

@@ -212,6 +217,7 @@ int main(int argc, char* argv[])
                               StrideB,
                               std::array<ck::index_t, NumDTensor>{I0, I0},
                               StrideE,
+                              KBatch,
                               a_element_op,
                               b_element_op,
                               cde_element_op);

@@ -236,10 +242,12 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;

-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-
     if(do_verification)
     {
+        invoker.Run(argument, StreamConfig{nullptr, false});
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
         Tensor<CShuffleDataType> c_m_n({M, N});

         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
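KBatch is a split-K factor: the GEMM's K loop is partitioned into KBatch chunks whose partial products are then combined. A scalar sketch of the arithmetic only; the device op does the actual partitioning and combination itself:

// splitk_sketch.cpp -- what a split-K factor means for a single dot product.
#include <algorithm>
#include <vector>

float dot_splitk(const std::vector<float>& a, const std::vector<float>& b, int kbatch)
{
    const int K     = static_cast<int>(a.size());
    const int chunk = (K + kbatch - 1) / kbatch;
    float acc       = 0.f;
    for(int s = 0; s < kbatch; ++s) // each split would run concurrently on device
    {
        float partial = 0.f;
        for(int k = s * chunk; k < std::min(K, (s + 1) * chunk); ++k)
            partial += a[k] * b[k];
        acc += partial; // partials combined, e.g. atomically or in a second pass
    }
    return acc;
}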
example/CMakeLists.txt

@@ -72,6 +72,20 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
                 list(REMOVE_ITEM FILE_NAME "${source}")
             endif()
         endforeach()
+        #Do not build any FP8 examples if CK_ENABLE_FP8 not set
+        foreach(source IN LISTS FILE_NAME)
+            if(NOT DEFINED CK_ENABLE_FP8 AND source MATCHES "_fp8")
+                message("removing fp8 example ${source}")
+                list(REMOVE_ITEM FILE_NAME "${source}")
+            endif()
+        endforeach()
+        #Do not build any BF8 examples if CK_ENABLE_BF8 not set
+        foreach(source IN LISTS FILE_NAME)
+            if(NOT DEFINED CK_ENABLE_BF8 AND source MATCHES "_bf8")
+                message("removing bf8 example ${source}")
+                list(REMOVE_ITEM FILE_NAME "${source}")
+            endif()
+        endforeach()
         #only continue if there are some source files left on the list
         if(FILE_NAME)
             if(FILE_NAME MATCHES "_xdl")
example/ck_tile/01_fmha/CMakeLists.txt

+# validate user-specified fmha_fwd API list
+set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv")
+set(FMHA_FWD_ENABLE_APIS "fwd" CACHE STRING
+    "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(FMHA_FWD_ENABLE_APIS STREQUAL "all")
+  set(FMHA_FWD_ENABLE_APIS ${FMHA_FWD_KNOWN_APIS})
+endif()
+
+foreach(api ${FMHA_FWD_ENABLE_APIS})
+  if(NOT "${api}" IN_LIST FMHA_FWD_KNOWN_APIS)
+    message(FATAL_ERROR "${api} isn't a known api: ${FMHA_FWD_KNOWN_APIS}.")
+  endif()
+endforeach()
+
+# "fwd" is a must-have api for the fmha_fwd example, add it if not specified
+if(NOT "fwd" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND FMHA_FWD_ENABLE_APIS "fwd")
+endif()
+string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
+
 # generate a list of kernels, but not actually emit files at config stage
 execute_process(
   COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api fwd,fwd_splitkv --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  --api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  RESULT_VARIABLE ret
 )
+if(ret AND NOT ret EQUAL 0)
+  message(FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD kernels via Python.")
+endif()

 execute_process(
   COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
+  --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
+  RESULT_VARIABLE ret
 )
+if(ret AND NOT ret EQUAL 0)
+  message(FATAL_ERROR "CK Tile FMHA FAILED to generate a list of BWD kernels via Python.")
+endif()

 # NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
 # as current cmake list, otherwise will not figure out the dependency properly

@@ -17,13 +45,13 @@ file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS)
 add_custom_command(
   OUTPUT ${FMHA_FWD_GEN_BLOBS}
   COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api fwd,fwd_splitkv --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  --api ${FMHA_FWD_APIS} --output_dir ${CMAKE_CURRENT_BINARY_DIR}
 )
 add_custom_command(
   OUTPUT ${FMHA_BWD_GEN_BLOBS}
   COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR} --receipt 3
 )

 set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")

@@ -55,10 +83,23 @@ set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
 # ... because they are auto-generated
 if(FMHA_FWD_FAST_EXP2)
     list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
-    list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
 else()
     list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
-    list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
 endif()
+list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)
+
+# conditionally enable call to the fwd_splitkv API in fmha_fwd example
+if("fwd_splitkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+else()
+    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=0)
+endif()
+
+# conditionally enable call to the fwd_appendkv API in fmha_fwd example
+if("fwd_appendkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+else()
+    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
+endif()

 # Allow comparing floating points directly in order to check sentinel values
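With this change the set of generated fmha_fwd APIs becomes configurable at CMake time: for example, configuring with -DFMHA_FWD_ENABLE_APIS="fwd;fwd_appendkv" (or "all") selects which kernel families generate.py emits and which CK_TILE_FMHA_FWD_*_API macros the example is compiled with; "fwd" is always appended if omitted.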
example/ck_tile/01_fmha/codegen/cpp_symbol_map.py

@@ -66,6 +66,34 @@ BIAS_CHECK_MAP = {
     "alibi" : "bias_enum::alibi"
 }

+DROPOUT_MAP = {
+    "no"                        : "ck_tile::BlockDropoutBwd<false, true, false>",
+    "dropout_wg32"              : "ck_tile::BlockDropoutBwd<true, true, false>",
+    "dropout_wg32_storerandval" : "ck_tile::BlockDropoutBwd<true, true, true >",
+    "dropout_wg16"              : "ck_tile::BlockDropoutBwd<true, false, false>",
+    "dropout_wg16_storerandval" : "ck_tile::BlockDropoutBwd<true, false, true >"
+}
+
+DROPOUT_CHECK_MAP = {
+    "no"                        : "t.has_dropout == false",
+    "dropout_wg32"              : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg32_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+    "dropout_wg16"              : "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg16_storerandval" : "t.has_dropout == true && t.is_store_randval == true",
+}
+
+ROPE_MAP = {
+    "no"    : "ck_tile::RotaryEmbeddingEnum::NONE",
+    "inter" : "ck_tile::RotaryEmbeddingEnum::INTERLEAVED",
+    "half"  : "ck_tile::RotaryEmbeddingEnum::HALF_ROTATED"
+}
+
+ROPE_CHECK_MAP = {
+    "no"    : "rope_enum::none",
+    "inter" : "rope_enum::interleaved",
+    "half"  : "rope_enum::half_rotated"
+}
+
 MODE_MAP = {
     "batch" : "false",
     "group" : "true"
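These tables are consumed by the codegen below: ROPE_MAP and ROPE_CHECK_MAP feed the {F_rope}/{F_rope_check} placeholders in the new fmha_fwd_appendkv.py, while DROPOUT_MAP and DROPOUT_CHECK_MAP back the dropout variants reworked in fmha_bwd.py (whose diff is collapsed below).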
example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py  (+370 −179, diff collapsed)
example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py  (new file, mode 100644)

# SPDX-License-Identifier: MIT
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
# generate kernel instances to speed up compilation

import copy
from dataclasses import dataclass
import fnmatch
import itertools
from pathlib import Path
from typing import List, Optional, Tuple

from codegen.cmake_config import *
from codegen.cpp_symbol_map import *

from codegen.ops.fmha_fwd import (
    FmhaFwdApiTrait,
    DTYPE_BITS,
    FMHA_FWD_KERNEL_HEADER,
    FMHA_FWD_API_PER_DTYPE,
    FMHA_FWD_API_PER_HDIM_CASE,
)

FMHA_FWD_APPENDKV_KERNEL_BODY="""
using fmha_dtype_{F_idx} = {F_dtype};

using fmha_trait_{F_idx} = ck_tile::TileFmhaFwdAppendKVTraits<{F_spad},
                                                              {F_skpad},
                                                              {F_dpad},
                                                              {F_dvpad},
                                                              {F_occupancy}>;

using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProblem<
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
    {F_bs},
    {F_bsk},
    {F_bd},
    {F_bdv},
    {F_vlayout},
    {F_rope},
    {F_pagedkv},
    fmha_trait_{F_idx}>;

using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline<
    fmha_pipeline_problem_{F_idx}>;

using fmha_kernel_{F_idx} =
    ck_tile::FmhaFwdAppendKVKernel<ck_tile::FmhaFwdAppendKVTilePartitioner<{F_bs}, {F_bsk}, {F_bd}, {F_bdv}>,
                                   fmha_pipeline_{F_idx}>;

using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout},
                                                {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;

#include <iostream>

template<>
float fmha_fwd_appendkv_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_appendkv_args a)
{{
    using k_ = fmha_kernel_{F_idx};
    if(s.log_level_ > 0)
        std::cout << ", " << k_::GetName() << std::flush;
    auto [kargs, grids] = fmha_fwd_appendkv_create_kargs_and_grids<k_>(a);
    constexpr dim3 blocks                  = k_::BlockSize();
    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
}}
"""

FMHA_FWD_APPENDKV_API_FILENAME="fmha_fwd_appendkv_api.cpp"
FMHA_FWD_APPENDKV_API="""
float fmha_fwd_appendkv(fmha_fwd_appendkv_traits t, fmha_fwd_appendkv_args a, const ck_tile::stream_config& s){{
    float r = -1;
{F_dispatch}
    return r;
}}
"""

FMHA_FWD_APPENDKV_API_INNER_DISPATCH=""" {F_if}((t.is_v_rowmajor == {F_vlayout}) &&
            ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.rope_type == {F_rope_check}) &&
            ((a.block_table_ptr != nullptr) == {F_pagedkv})) {{
        using trait_ = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;
        return fmha_fwd_appendkv_<trait_>(s, a);
    }}
"""

@dataclass
class FmhaFwdAppendKVApiTrait:
    # sync with fmha_fwd_traits<>, to generate fallback calls
    hdim    : str
    dtype   : str  # data type
    bs      : int  # tile size along q seqlen
    bsk     : int  # tile size along k seqlen
    bd      : int  # tile size along qk gemm unroll
    bdv     : int  # tile size along kv gemm unroll
    vlayout : str
    spad    : str
    skpad   : str
    dpad    : str
    dvpad   : str
    rope    : str  # key from ROPE_MAP
    pagedkv : str

    @property
    def name(self) -> str:
        return f'{self.hdim}-{self.dtype}-{self.bs}-{self.bsk}-{self.bd}-{self.bdv}-{self.vlayout}-' +\
            f'{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.rope}-{self.pagedkv}'

    @property
    def scheck(self) -> str:
        if self.spad == 't' : return f'true /*a.seqlen_q % {self.bs} != 0*/'
        else :                return f'a.seqlen_q % {self.bs} == 0'

    @property
    def skcheck(self) -> str:
        # we do not check all the values in a.seqlen_k_ptr
        return 'true'

    @property
    def dcheck(self) -> str:
        if self.dpad == 't' : return f'true /*a.hdim_q % {self.bd} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
        else :                return f'a.hdim_q % {self.bd} == 0'

    @property
    def dvcheck(self) -> str:
        if self.dvpad == 't' : return f'true /*a.hdim_v % {self.bdv} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
        else :                 return f'a.hdim_v % {self.bdv} == 0'

@dataclass
class FmhaFwdAppendKVPipeline:
    F_vlayout : str  # row/col
    F_spad    : str  # true/false
    F_skpad   : str  #
    F_dpad    : str  #
    F_dvpad   : str  #
    F_rope    : str  # key from ROPE_MAP
    F_pagedkv : str  # t/f

    @property
    def name(self) -> str:
        def pad_name() -> str:
            n = ''
            if self.F_spad == 't': n += 's'
            if self.F_skpad == 't': n += 'sk'
            if self.F_dpad == 't': n += 'd'
            if self.F_dvpad == 't': n += 'dv'
            if n != '': n = 'p' + n
            return n
        pn = pad_name()
        n = f'v{self.F_vlayout[0]}'
        if pn != '': n += f'_{pn}'
        if self.F_rope != 'no': n += f'_{self.F_rope}'
        if self.F_pagedkv == 't': n += '_pagedkv'
        return n

class FmhaFwdAppendKVApiPool:
    def __init__(self, mask_impl):
        self.pool = dict()
        self.mask_impl = mask_impl

    def register_traits(self, trait : FmhaFwdApiTrait) -> None:
        # TODO: do we need to check duplication?
        if trait.dtype not in self.pool.keys():
            self.pool[trait.dtype] = dict()
        if trait.hdim not in self.pool[trait.dtype].keys():
            self.pool[trait.dtype][trait.hdim] = list()

        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))

    @property
    def api(self) -> str:
        per_dtypes = str()
        for i, dtype in enumerate(self.pool.keys()):
            per_hdim_case = str()
            for j, hdim in enumerate(self.pool[dtype].keys()):
                traits = self.pool[dtype][hdim]
                inners = str()
                for k, trait in enumerate(traits):
                    if_k = 'if' if k == 0 else 'else if'
                    inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(
                        F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout],
                        F_scheck=trait.scheck, F_skcheck=trait.skcheck,
                        F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                        F_rope_check=ROPE_CHECK_MAP[trait.rope],
                        F_pagedkv=BOOL_MAP[trait.pagedkv],
                        F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad],
                        F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                        F_rope=ROPE_MAP[trait.rope],
                        F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv,
                        F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                if_j = 'if' if j == 0 else 'else if'
                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(
                    F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
            if_i = 'if' if i == 0 else 'else if'
            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(
                F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_APPENDKV_API.format(F_dispatch=per_dtypes)

@dataclass
class FmhaFwdAppendKVTileSize:
    F_bs        : int  # tile size along q seqlen
    F_bsk       : int  # tile size along k seqlen
    F_bd        : int  # tile size along qk gemm unroll
    F_bdv       : int  # tile size along kv gemm unroll
    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
    @property
    def name(self) -> str:
        return f"b{self.F_bs}x{self.F_bsk}x{self.F_bd}x{self.F_bdv}" +\
            ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")

@dataclass
class FmhaFwdAppendKVKernel:
    F_idx      : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim     : int  # hdim
    F_dtype    : str  # data type
    F_tile     : FmhaFwdAppendKVTileSize
    F_pipeline : FmhaFwdAppendKVPipeline
    mask_impl  : str

    @property
    def template(self) -> str:
        kernel_body = str()
        return FMHA_FWD_KERNEL_HEADER + \
            FMHA_FWD_APPENDKV_KERNEL_BODY.format(
                F_idx=self.F_idx,
                F_hdim=self.F_hdim,
                F_dtype=DTYPE_MAP[self.F_dtype],
                F_bs=self.F_tile.F_bs,
                F_bsk=self.F_tile.F_bsk,
                F_bd=self.F_tile.F_bd,
                F_bdv=self.F_tile.F_bdv,
                F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout],
                F_spad=BOOL_MAP[self.F_pipeline.F_spad],
                F_skpad=BOOL_MAP[self.F_pipeline.F_skpad],
                F_dpad=BOOL_MAP[self.F_pipeline.F_dpad],
                F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad],
                F_rope=ROPE_MAP[self.F_pipeline.F_rope],
                F_pagedkv=BOOL_MAP[self.F_pipeline.F_pagedkv],
                F_occupancy=self.F_tile.F_occupancy)

    @property
    def name(self) -> str:
        # TODO: we don't encode idx here
        return f"fmha_fwd_appendkv_d{self.F_hdim}_{self.F_dtype}_" + \
            self.F_tile.name + '_' + self.F_pipeline.name

    @property
    def filename(self) -> str:
        return self.name + ".cpp"

    def api_trait(self) -> FmhaFwdAppendKVApiTrait:
        return FmhaFwdAppendKVApiTrait(
            hdim=str(self.F_hdim),
            dtype=self.F_dtype,
            bs=self.F_tile.F_bs,
            bsk=self.F_tile.F_bsk,
            bd=self.F_tile.F_bd,
            bdv=self.F_tile.F_bdv,
            vlayout=self.F_pipeline.F_vlayout,
            spad=self.F_pipeline.F_spad,
            skpad=self.F_pipeline.F_skpad,
            dpad=self.F_pipeline.F_dpad,
            dvpad=self.F_pipeline.F_dvpad,
            rope=self.F_pipeline.F_rope,
            pagedkv=self.F_pipeline.F_pagedkv)

# TODO: design a more practical way to do it
# this is current supported tile size per hdim
def get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
            '32'  : FmhaFwdAppendKVTileSize(64, 64,  32,  32, -1),
            '64'  : FmhaFwdAppendKVTileSize(64, 64,  64,  64, -1),
            '128' : FmhaFwdAppendKVTileSize(64, 64, 128, 128, -1),
            '256' : FmhaFwdAppendKVTileSize(64, 64, 256, 256, -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
        return {
            '64'  : FmhaFwdAppendKVTileSize(64, 64,  64,  64, -1),
            '128' : FmhaFwdAppendKVTileSize(64, 64, 128, 128, -1),
            '256' : FmhaFwdAppendKVTileSize(64, 64, 256, 256, -1)
        }
    else:
        return None

def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt,
                           mask_impl) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    #       support this in future
    def get_pipelines(dtype, hdim) -> List[FmhaFwdAppendKVPipeline]:
        # this function will populate a list possible pipelines
        # TODO: the order of List matters! the later in this list will be also be checked later
        # TODO: currently for qr pipeline, let 't' padding to appear later!!
        # TODO: how to design this more generic?
        squant = 't' if dtype == 'fp8' else 'f'
        pipelines = []
        if dtype in ['fp16', 'bf16']:
            # NOTICE: it will be very complicated if we consider all the hdim_q padding cases while
            # applying rotary embedding, so I just use 't' in inter/half pipelines
            for vlayout in ['row', 'col']:
                for pagedkv in ["t", "f"]:
                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 'f', 't', 'f', 'f', 'no', pagedkv))
                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 't', 't', 't', 't', 'no', pagedkv))

                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 'f', 't', 't', 'f', 'inter', pagedkv))
                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 't', 't', 't', 't', 'inter', pagedkv))

                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 'f', 't', 't', 'f', 'half', pagedkv))
                    pipelines.append(FmhaFwdAppendKVPipeline(vlayout, 't', 't', 't', 't', 'half', pagedkv))
        elif dtype in ['fp8', 'bf8']:
            # rope/paged-kv is not supported
            pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f'))
        else:
            assert False
        return pipelines

    gen = list()
    api_pool = FmhaFwdAppendKVApiPool(mask_impl)

    for dtype in DTYPE_MAP.keys():
        d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype)
        if d == None:
            continue
        for hdim_str in d.keys():
            tile = d[hdim_str]
            hdim = int(hdim_str)
            for pipeline in get_pipelines(dtype, hdim):
                k = FmhaFwdAppendKVKernel(F_idx=0,
                                          F_hdim=hdim,
                                          F_dtype=dtype,
                                          F_tile=tile,
                                          F_pipeline=pipeline,
                                          mask_impl=mask_impl)
                if kernel_filter != None:
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
                if receipt == 2:
                    cond = dtype in ['fp16', 'bf16']
                    cond &= pipeline.F_vlayout == 'row'
                    if not cond:
                        continue
                api_pool.register_traits(k.api_trait())
                gen.append(k)

    return (api_pool, gen)

def write_single_kernel(kernel: FmhaFwdAppendKVKernel, autogen_dir: Path) -> None:
    (autogen_dir / kernel.filename).write_text(kernel.template)

def write_fwd_appendkv_api(api_pool : FmhaFwdAppendKVApiPool, autogen_dir: Path) -> None:
    (autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME).write_text(api_pool.api)

def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
    api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
    write_fwd_appendkv_api(api_pool, output_dir)

def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
    with file_path.open('a') as f:
        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
\ No newline at end of file
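As a worked example of the naming logic above: for dtype fp16, hdim 64, tile FmhaFwdAppendKVTileSize(64, 64, 64, 64, -1), and the first generated pipeline (row-major V, spad 'f', skpad 't', dpad 'f', dvpad 'f', no rope, no paged-kv), pad_name() yields 'psk' and the pipeline name 'vr_psk', so the emitted blob would be named fmha_fwd_appendkv_d64_fp16_b64x64x64x64_vr_psk.cpp (derived here by hand from the name properties; the generator, not this text, is authoritative).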
example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
View file @
72c9f129
...
@@ -21,6 +21,14 @@ from codegen.ops.fmha_fwd import (
...
@@ -21,6 +21,14 @@ from codegen.ops.fmha_fwd import (
)
)
DTYPE_BITS
=
{
"fp32"
:
32
,
"fp16"
:
16
,
"bf16"
:
16
,
"fp8"
:
8
,
"bf8"
:
8
}
FMHA_FWD_SPLITKV_PIPELINE_MAP
=
{
FMHA_FWD_SPLITKV_PIPELINE_MAP
=
{
"qr"
:
"ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS"
,
"qr"
:
"ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS"
,
"qr_async"
:
"ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync"
,
"qr_async"
:
"ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync"
,
...
@@ -51,8 +59,8 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
...
@@ -51,8 +59,8 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
{F_bias},
{F_bias},
false,
false,
{F_lse},
{F_lse},
{F_dropout},
{F_squant},
{F_squant},
{F_pagedkv},
kHasUnevenSplits,
kHasUnevenSplits,
{F_occupancy}>;
{F_occupancy}>;
...
@@ -63,7 +71,6 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
...
@@ -63,7 +71,6 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::RandValOutputDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
...
@@ -86,7 +93,7 @@ using fmha_kernel =
...
@@ -86,7 +93,7 @@ using fmha_kernel =
fmha_pipeline,
fmha_pipeline,
fmha_epilogue>;
fmha_epilogue>;
static void run(const ck_tile::stream_config& s, fmha_fwd_args a)
static void run(const ck_tile::stream_config& s, fmha_fwd_
splitkv_
args a)
{{
{{
using k_ = fmha_kernel;
using k_ = fmha_kernel;
auto [kargs, grids] = fmha_fwd_splitkv_create_kargs_and_grids<k_>(a);
auto [kargs, grids] = fmha_fwd_splitkv_create_kargs_and_grids<k_>(a);
...
@@ -97,16 +104,21 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_args a)
...
@@ -97,16 +104,21 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_args a)
}};
}};
}}
}}
using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout},
{F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
{F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
{F_dvpad}>;
#include <iostream>
#include <iostream>
template<>
template<>
void fmha_fwd_splitkv_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_args a)
void fmha_fwd_splitkv_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_
splitkv_
args a)
{{
{{
if constexpr({F_mode} == false) {{ // batch mode
if constexpr({F_mode} == false) {{ // batch mode
if (a.seqlen_k % (a.num_splits * {F_bn0}) == 0) {{
// we don't check every seqlen_k values for kvcache
if (a.seqlen_k_ptr != nullptr) {{
kernel_runner<true>::run(s, a);
// make sure F_bn0 is divisible by F_bk1
}} else if (a.seqlen_k % (a.num_splits * {F_bn0}) == 0) {{
kernel_runner<false>::run(s, a);
kernel_runner<false>::run(s, a);
}} else {{
}} else {{
kernel_runner<true>::run(s, a);
kernel_runner<true>::run(s, a);
...
@@ -160,7 +172,7 @@ using fmha_kernel =
...
@@ -160,7 +172,7 @@ using fmha_kernel =
fmha_pipeline,
fmha_pipeline,
fmha_epilogue>;
fmha_epilogue>;
static void run(const ck_tile::stream_config& s, fmha_fwd_args a)
static void run(const ck_tile::stream_config& s, fmha_fwd_
splitkv_
args a)
{{
{{
using k_ = fmha_kernel;
using k_ = fmha_kernel;
auto [kargs, grids] = fmha_fwd_splitkv_combine_create_kargs_and_grids<k_>(a);
auto [kargs, grids] = fmha_fwd_splitkv_combine_create_kargs_and_grids<k_>(a);
...
@@ -177,7 +189,7 @@ using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_m
...
@@ -177,7 +189,7 @@ using trait_{F_idx} = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_m
#include <iostream>
#include <iostream>
template<>
template<>
void fmha_fwd_splitkv_combine_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_args a)
void fmha_fwd_splitkv_combine_oneshot_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_
splitkv_
args a)
{{
{{
if (a.num_splits <= 16) {{
if (a.num_splits <= 16) {{
kernel_runner<4>::run(s, a);
kernel_runner<4>::run(s, a);
...
@@ -203,7 +215,7 @@ FMHA_FWD_SPLITKV_API="""
...
@@ -203,7 +215,7 @@ FMHA_FWD_SPLITKV_API="""
#include <iostream>
#include <iostream>
template<typename fmha_fwd_splitkv_traits_, typename fmha_fwd_splitkv_combine_traits_>
template<typename fmha_fwd_splitkv_traits_, typename fmha_fwd_splitkv_combine_traits_>
float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_args a)
float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_
splitkv_
args a)
{{
{{
if(s.log_level_ > 0)
if(s.log_level_ > 0)
std::cout
std::cout
...
@@ -217,22 +229,96 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_args a)
...
@@ -217,22 +229,96 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_args a)
);
);
}}
}}
float fmha_fwd_splitkv(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config& s){{
float fmha_fwd_splitkv(fmha_fwd_
splitkv_
traits t, fmha_fwd_
splitkv_
args a, const ck_tile::stream_config& s){{
float r = -1;
float r = -1;
{F_dispatch}
{F_dispatch}
return r;
return r;
}}
}}
"""
"""
FMHA_FWD_SPLITKV_API_INNER_DISPATCH
=
""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})
&& (t.has_dropout == {F_dropout})
&& (t.do_fp8_static_quant == {F_squant}) &&
FMHA_FWD_SPLITKV_API_INNER_DISPATCH
=
""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse}) && (t.do_fp8_static_quant == {F_squant}) &&
({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
((a.block_table_ptr != nullptr) == {F_pagedkv}) &&
({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
using traits_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_
dropou
t}, {F_
squant
}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
using traits_ = fmha_fwd_
splitkv_
traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_
squan
t}, {F_
pagedkv
}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
using traits2_ = fmha_fwd_splitkv_combine_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}/2, {F_bn1}, {F_lse}, {F_squant}, {F_spad}, {F_dvpad}>;
return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
return fmha_fwd_splitkv_<traits_, traits2_>(s, a);
}}
}}
"""
"""
@
dataclass
class
FmhaFwdSplitKVApiTrait
:
pipeline_tag
:
str
# sync with fmha_fwd_traits<>, to generate fallback calls
hdim
:
str
dtype
:
str
# data type
mode
:
str
# value from MODE_MAP
bm0
:
int
# tile size along q seqlen (block size)
bn0
:
int
# tile size along qk seqlen
bk0
:
int
# tile size along qk gemm unroll
bn1
:
int
# tile size along v head_dim
bk1
:
int
# tile size along kv gemm unroll
bk0blen
:
int
vlayout
:
str
mask
:
str
bias
:
str
#
lse
:
str
#
squant
:
str
#
spad
:
str
skpad
:
str
dpad
:
str
dvpad
:
str
pagedkv
:
str
@
property
def
name
(
self
)
->
str
:
return
f
'
{
self
.
hdim
}
-
{
self
.
dtype
}
-
{
self
.
mode
}
-
{
self
.
bm0
}
-
{
self
.
bn0
}
-
{
self
.
bk0
}
-
{
self
.
bn0
}
-
{
self
.
bk1
}
-
{
self
.
bk0blen
}
-'
+
\
f
'
{
self
.
vlayout
}
-
{
self
.
mask
}
-
{
self
.
bias
}
-
{
self
.
lse
}
-
{
self
.
squant
}
-
{
self
.
spad
}
-
{
self
.
skpad
}
-
{
self
.
dpad
}
-'
+
\
f
'
{
self
.
dvpad
}
-
{
self
.
pagedkv
}
'
@
property
def
scheck
(
self
)
->
str
:
if
self
.
mode
==
'group'
:
return
'true/*group mode spad always true*/'
# group mode only generate spad/skpad == true
if
self
.
pipeline_tag
==
'qr_async'
:
if
self
.
spad
==
't'
:
return
'true'
# always support
else
:
return
'true'
elif
self
.
pipeline_tag
in
[
'qr'
]:
if
self
.
spad
==
't'
:
return
f
'true /*a.seqlen_q %
{
self
.
bm0
}
!= 0*/'
# TODO: order of get_pipelines() matters! (ugly)
else
:
return
f
'a.seqlen_q %
{
self
.
bm0
}
== 0'
else
:
assert
False
@
property
def
skcheck
(
self
)
->
str
:
if
self
.
mode
==
'group'
:
return
'true/*group mode skpad always true*/'
# group mode only generate spad/skpad == true
if
self
.
pipeline_tag
==
'qr_async'
:
if
self
.
skpad
==
't'
:
return
f
'a.seqlen_k == 0 || a.seqlen_k %
{
self
.
bn0
}
!= 0'
else
:
return
f
'a.seqlen_k != 0 && a.seqlen_k %
{
self
.
bn0
}
== 0'
elif
self
.
pipeline_tag
in
[
'qr'
,
'qr_fp8'
]:
if
self
.
skpad
==
't'
:
return
f
'true /*a.seqlen_k %
{
self
.
bn0
}
!= 0*/'
# TODO: order of get_pipelines() matters! (ugly)
else
:
return
f
'a.seqlen_k %
{
self
.
bn0
}
== 0'
else
:
assert
False
@
property
def
dcheck
(
self
)
->
str
:
if
self
.
pipeline_tag
==
'qr_async'
:
vec
=
int
((
32
*
4
)
/
DTYPE_BITS
[
self
.
dtype
])
if
self
.
dpad
==
't'
:
return
f
'a.hdim_q %
{
vec
}
== 0'
else
:
assert
False
elif
self
.
pipeline_tag
in
[
'qr'
]:
if
self
.
dpad
==
't'
:
return
f
'true /*a.hdim_q %
{
self
.
bk0blen
}
!= 0*/'
# TODO: order of get_pipelines() matters! (ugly)
else
:
return
f
'a.hdim_q %
{
self
.
bk0blen
}
== 0'
else
:
assert
False
+    @property
+    def dvcheck(self) -> str:
+        if self.pipeline_tag == 'qr_async':
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
+            else: assert False
+        elif self.pipeline_tag in ['qr']:
+            if self.dvpad == 't': return f'true /*a.hdim_v % {self.bk0blen} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
+            else:                 return f'a.hdim_v % {self.bk0blen} == 0'
+        else: assert False
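
For illustration (not part of the commit): each `*check` property returns a C++ boolean expression as a string; the generated API evaluates it at dispatch time to select a kernel whose padding configuration matches the problem. For the `qr_async` pipeline, `vec = int((32 * 4) / DTYPE_BITS[dtype])` is the number of elements per 128-bit vector load, so the head dimension must be divisible by it. A sketch, assuming the bit widths below (the real `DTYPE_BITS` is defined elsewhere in this script):

    # Assumed bit widths, purely for the arithmetic.
    DTYPE_BITS = {'fp16': 16, 'bf16': 16, 'fp8': 8}
    vec = int((32 * 4) / DTYPE_BITS['fp16'])  # 128 bits / 16 bits = 8 elements per load
    print(f'a.hdim_q % {vec} == 0')           # -> a.hdim_q % 8 == 0
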
@dataclass
class FmhaFwdSplitKVPipeline:
    tag : str
...
@@ -244,8 +330,8 @@ class FmhaFwdSplitKVPipeline:
     F_dvpad      : str #
     F_bias       : str # true/false
     F_lse        : str #
-    F_dropout    : str #
     F_squant     : str #
+    F_pagedkv    : str # t/f
     F_mask       : str # value from MASK_MAP
     @property
...
@@ -267,8 +353,8 @@ class FmhaFwdSplitKVPipeline:
         else:
             if self.F_mask != 'no': n += f'_m{self.F_mask[0]}'
         if self.F_lse == 't': n += '_lse'
-        if self.F_dropout == 't': n += '_dropout'
         if self.F_squant == 't': n += '_squant'
+        if self.F_pagedkv == 't': n += '_pagedkv'
         return n
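
For illustration (not part of the commit): with this change a paged-KV variant is tagged in the kernel name where the dropout tag used to be. A sketch of the suffix accumulation, with hypothetical flags and base name:

    # Hypothetical flags and base name, mirroring the suffix logic above.
    n = 'fmha_fwd_splitkv_d128_fp16'
    F_lse, F_squant, F_pagedkv = 't', 'f', 't'
    if F_lse == 't': n += '_lse'
    if F_squant == 't': n += '_squant'
    if F_pagedkv == 't': n += '_pagedkv'
    print(n)  # fmha_fwd_splitkv_d128_fp16_lse_pagedkv
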
@dataclass
...
@@ -300,7 +386,7 @@ class FmhaFwdSplitKVApiPool:
         self.pool = dict()
         self.mask_impl = mask_impl

-    def register_traits(self, trait : FmhaFwdApiTrait) -> None:
+    def register_traits(self, trait : FmhaFwdSplitKVApiTrait) -> None:
         # TODO: do we need to check duplication?
         if trait.dtype not in self.pool.keys():
             self.pool[trait.dtype] = dict()
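
For illustration (not part of the commit): the pool is a nested dict keyed first by dtype; the second level (not visible in this hunk, presumably keyed by hdim) collects the traits that the API emitter later walks to print one dispatch branch per registered kernel. The same bookkeeping pattern in miniature:

    # Minimal sketch of the registration pattern; not the real class.
    pool = {}
    def register(dtype, hdim, trait):
        pool.setdefault(dtype, {}).setdefault(hdim, []).append(trait)
    register('fp16', '128', 'trait-a')
    register('fp16', '128', 'trait-b')
    print(pool)  # {'fp16': {'128': ['trait-a', 'trait-b']}}
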
...
@@ -322,8 +408,8 @@ class FmhaFwdSplitKVApiPool:
             inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
                 F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask],
                 F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
-                F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_squant=BOOL_MAP[trait.squant],
+                F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv],
                 F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                 F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                 F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
                 F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
...
@@ -383,8 +469,8 @@ class FmhaFwdSplitKVKernel:
                 F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad],
                 F_bias=BIAS_MAP[self.F_pipeline.F_bias],
                 F_lse=BOOL_MAP[self.F_pipeline.F_lse],
-                F_dropout=BOOL_MAP[self.F_pipeline.F_dropout],
                 F_squant=BOOL_MAP[self.F_pipeline.F_squant],
+                F_pagedkv=BOOL_MAP[self.F_pipeline.F_pagedkv],
                 F_occupancy=self.F_tile.F_occupancy,
                 F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                 F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
...
@@ -401,8 +487,8 @@ class FmhaFwdSplitKVKernel:
     def filename(self) -> str:
         return self.name + ".cpp"

-    def api_trait(self) -> FmhaFwdApiTrait:
-        return FmhaFwdApiTrait(
+    def api_trait(self) -> FmhaFwdSplitKVApiTrait:
+        return FmhaFwdSplitKVApiTrait(
             pipeline_tag=self.F_pipeline.tag,
             hdim=str(self.F_hdim),
             dtype=self.F_dtype,
@@ -417,8 +503,8 @@ class FmhaFwdSplitKVKernel:
...
@@ -417,8 +503,8 @@ class FmhaFwdSplitKVKernel:
mask
=
self
.
F_pipeline
.
F_mask
,
mask
=
self
.
F_pipeline
.
F_mask
,
bias
=
self
.
F_pipeline
.
F_bias
,
bias
=
self
.
F_pipeline
.
F_bias
,
lse
=
self
.
F_pipeline
.
F_lse
,
lse
=
self
.
F_pipeline
.
F_lse
,
dropout
=
self
.
F_pipeline
.
F_dropout
,
squant
=
self
.
F_pipeline
.
F_squant
,
squant
=
self
.
F_pipeline
.
F_squant
,
pagedkv
=
self
.
F_pipeline
.
F_pagedkv
,
spad
=
self
.
F_pipeline
.
F_spad
,
spad
=
self
.
F_pipeline
.
F_spad
,
skpad
=
self
.
F_pipeline
.
F_skpad
,
skpad
=
self
.
F_pipeline
.
F_skpad
,
dpad
=
self
.
F_pipeline
.
F_dpad
,
dpad
=
self
.
F_pipeline
.
F_dpad
,
...
@@ -460,29 +546,6 @@ class FmhaFwdSplitKVCombineKernel:
...
@@ -460,29 +546,6 @@ class FmhaFwdSplitKVCombineKernel:
def
filename
(
self
)
->
str
:
def
filename
(
self
)
->
str
:
return
self
.
name
+
".cpp"
return
self
.
name
+
".cpp"
def
api_trait
(
self
)
->
FmhaFwdApiTrait
:
return
FmhaFwdApiTrait
(
pipeline_tag
=
self
.
F_pipeline
.
tag
,
hdim
=
str
(
self
.
F_hdim
),
dtype
=
self
.
F_dtype
,
mode
=
self
.
F_mode
,
bm0
=
self
.
F_tile
.
F_bm0
,
bn0
=
self
.
F_tile
.
F_bn0
,
bk0
=
self
.
F_tile
.
F_bk0
,
bn1
=
self
.
F_tile
.
F_bn1
,
bk1
=
self
.
F_tile
.
F_bk1
,
bk0blen
=
self
.
F_tile
.
F_bk0blen
,
vlayout
=
self
.
F_pipeline
.
F_vlayout
,
mask
=
self
.
F_pipeline
.
F_mask
,
bias
=
self
.
F_pipeline
.
F_bias
,
lse
=
self
.
F_pipeline
.
F_lse
,
dropout
=
self
.
F_pipeline
.
F_dropout
,
squant
=
self
.
F_pipeline
.
F_squant
,
spad
=
self
.
F_pipeline
.
F_spad
,
skpad
=
self
.
F_pipeline
.
F_skpad
,
dpad
=
self
.
F_pipeline
.
F_dpad
,
dvpad
=
self
.
F_pipeline
.
F_dvpad
)
 # TODO: design a more practical way to do it
 # this is the currently supported tile size per hdim
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
...
@@ -533,27 +596,27 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
     squant = 't' if dtype == 'fp8' else 'f'
     pipelines = []
     if dtype in ['fp16', 'bf16']:
-        # splitkv kernel donot support dropout
-        for mask, bias, lse, dropout in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["f"]):
-            if hdim == 256:
+        for mask, bias, lse, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
+            # TODO: use async pipeline when compiler is more stable
+            if hdim == 256 or hdim in [32, 64, 128]:
             # if True:
-                pipelines.append(Pipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
+                pipelines.append(Pipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
             else:
-                pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask))
-                pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
+                pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask))
                 if receipt == 1:
-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitrary hdim
-                    pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitrary hdim
+                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitrary hdim
+                    pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, squant, pagedkv, mask)) # TODO: cover arbitrary hdim
     elif dtype in ['fp8', 'bf8']:
-        # no need lse/dropout kernels
+        # no need lse/paged-kv kernels
         for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-            pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask))
+            pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', squant, 'f', mask))
     else:
         assert False
     return pipelines
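
For illustration (not part of the commit): the `itertools.product` loop enumerates every (mask, bias, lse, pagedkv) combination, and each combination appends four pipelines (row/col V layout, unpadded/padded), so moving from the fixed `["f"]` dropout slot to `["t", "f"]` for pagedkv doubles the variant count. A quick count under assumed map sizes (the real MASK_MAP/BIAS_MAP keys may differ):

    import itertools
    # Assumed three mask kinds and three bias kinds, purely for the arithmetic.
    masks  = ['no', 'causal_tl', 'causal_br']
    biases = ['no', 'elementwise', 'alibi']
    combos = list(itertools.product(masks, biases, ["t", "f"], ["t", "f"]))
    print(len(combos))      # 3 * 3 * 2 * 2 = 36 combinations
    print(len(combos) * 4)  # 144 pipelines: four layout/padding variants each
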
...
@@ -574,6 +637,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
         if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
             # in group mode, spad/skpad must be true, since we can't predict whether the seqlen of the current batch needs padding
             continue
+        if pipeline.F_pagedkv == 't':
+            # we only use batch-mode kernels to handle (paged) kv-cache problems
+            continue
         k = Kernel(F_idx=0,
                    F_hdim=hdim,
                    F_dtype=dtype,
...