Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
77404e83
Commit
77404e83
authored
Mar 22, 2023
by
ltqin
Browse files
expose deviece instance interface
parent
bb673452
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
184 additions
and
196 deletions
+184
-196
client_example/08_fused_attention/CMakeLists.txt
client_example/08_fused_attention/CMakeLists.txt
+2
-3
client_example/08_fused_attention/fused_attention_no_lib.cpp
client_example/08_fused_attention/fused_attention_no_lib.cpp
+57
-69
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp
...ax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp
+124
-0
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.cpp
...ax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.cpp
+1
-124
No files found.
client_example/08_fused_attention/CMakeLists.txt
View file @
77404e83
...
@@ -4,8 +4,7 @@ target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_o
...
@@ -4,8 +4,7 @@ target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_o
add_executable
(
client_fused_attention_bias fused_attention_bias.cpp
)
add_executable
(
client_fused_attention_bias fused_attention_bias.cpp
)
target_link_libraries
(
client_fused_attention_bias PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_fused_attention_bias PRIVATE composable_kernel::device_operations
)
add_executable
(
client_fused_attention_mask fused_attention_mask.cpp
)
target_link_libraries
(
client_fused_attention_mask PRIVATE composable_kernel::device_operations
)
add_executable
(
client_fused_attention_bias_mask fused_attention_bias_mask.cpp
)
add_executable
(
client_fused_attention_bias_mask fused_attention_bias_mask.cpp
)
target_link_libraries
(
client_fused_attention_bias_mask PRIVATE composable_kernel::device_operations
)
target_link_libraries
(
client_fused_attention_bias_mask PRIVATE composable_kernel::device_operations
)
add_executable
(
client_fused_attention_no_lib fused_attention_no_lib.cpp
)
client_example/08_fused_attention/fused_attention_
mask
.cpp
→
client_example/08_fused_attention/fused_attention_
no_lib
.cpp
View file @
77404e83
...
@@ -5,14 +5,14 @@
...
@@ -5,14 +5,14 @@
#include <vector>
#include <vector>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute
_general
.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute
/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance
.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
B0ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
B0ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Acc0ElementOp
=
ck
::
tensor_operation
::
element_wise
::
Scale
Mask
;
using
Acc0ElementOp
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
using
B1ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
B1ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
...
@@ -23,7 +23,6 @@ using ADataType = ck::half_t;
...
@@ -23,7 +23,6 @@ using ADataType = ck::half_t;
using
B0DataType
=
ck
::
half_t
;
using
B0DataType
=
ck
::
half_t
;
using
B1DataType
=
ck
::
half_t
;
using
B1DataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
D0DataType
=
int32_t
;
using
AccDataType
=
float
;
using
AccDataType
=
float
;
struct
SimpleDeviceMem
struct
SimpleDeviceMem
...
@@ -42,7 +41,7 @@ struct SimpleDeviceMem
...
@@ -42,7 +41,7 @@ struct SimpleDeviceMem
void
*
p_mem_
;
void
*
p_mem_
;
};
};
int
main
(
int
argc
,
char
*
argv
[]
)
int
main
()
{
{
int
G0
=
48
;
int
G0
=
48
;
int
G1
=
16
;
int
G1
=
16
;
...
@@ -50,7 +49,7 @@ int main(int argc, char* argv[])
...
@@ -50,7 +49,7 @@ int main(int argc, char* argv[])
int
N
=
1024
;
int
N
=
1024
;
int
K
=
64
;
int
K
=
64
;
int
O
=
64
;
int
O
=
64
;
// A layout [G0, M, G1, K]
// A layout [G0, M, G1, K]
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_lengths
{
G0
,
G1
,
M
,
K
};
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_lengths
{
G0
,
G1
,
M
,
K
};
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_strides
{
M
*
G1
*
K
,
K
,
G1
*
K
,
1
};
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_strides
{
M
*
G1
*
K
,
K
,
G1
*
K
,
1
};
...
@@ -67,13 +66,8 @@ int main(int argc, char* argv[])
...
@@ -67,13 +66,8 @@ int main(int argc, char* argv[])
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_lengths
{
G0
,
G1
,
M
,
O
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_lengths
{
G0
,
G1
,
M
,
O
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_strides
{
M
*
G1
*
O
,
O
,
G1
*
O
,
1
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_strides
{
M
*
G1
*
O
,
O
,
G1
*
O
,
1
};
// D layout [G0, M, G1, N]
std
::
vector
<
ck
::
index_t
>
d0_gs_ms_ns_lengths
{
G0
,
G1
,
M
,
N
};
std
::
vector
<
ck
::
index_t
>
d0_gs_ms_ns_strides
{
M
*
G1
*
N
,
N
,
G1
*
N
,
1
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
G0
*
G1
*
M
*
K
);
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
G0
*
G1
*
M
*
K
);
SimpleDeviceMem
b0_device_buf
(
sizeof
(
B0DataType
)
*
G0
*
G1
*
N
*
K
);
SimpleDeviceMem
b0_device_buf
(
sizeof
(
B0DataType
)
*
G0
*
G1
*
N
*
K
);
SimpleDeviceMem
d0_device_buf
(
sizeof
(
D0DataType
)
*
G0
*
G1
*
M
*
N
);
SimpleDeviceMem
b1_device_buf
(
sizeof
(
B1DataType
)
*
G0
*
G1
*
O
*
N
);
SimpleDeviceMem
b1_device_buf
(
sizeof
(
B1DataType
)
*
G0
*
G1
*
O
*
N
);
SimpleDeviceMem
c_device_buf
(
sizeof
(
CDataType
)
*
G0
*
G1
*
M
*
O
);
SimpleDeviceMem
c_device_buf
(
sizeof
(
CDataType
)
*
G0
*
G1
*
M
*
O
);
...
@@ -87,7 +81,7 @@ int main(int argc, char* argv[])
...
@@ -87,7 +81,7 @@ int main(int argc, char* argv[])
B0DataType
,
B0DataType
,
B1DataType
,
B1DataType
,
CDataType
,
CDataType
,
ck
::
Tuple
<
D0DataType
>
,
ck
::
Tuple
<>
,
ck
::
Tuple
<>
,
ck
::
Tuple
<>
,
AElementOp
,
AElementOp
,
B0ElementOp
,
B0ElementOp
,
...
@@ -95,10 +89,11 @@ int main(int argc, char* argv[])
...
@@ -95,10 +89,11 @@ int main(int argc, char* argv[])
B1ElementOp
,
B1ElementOp
,
CElementOp
,
CElementOp
,
MaskingSpec
>
;
MaskingSpec
>
;
// get device op instances
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
std
::
vector
<
std
::
unique_ptr
<
DeviceOp
>>
op_ptrs
;
DeviceOp
>::
GetInstances
();
ck
::
tensor_operation
::
device
::
instance
::
add_device_batched_gemm_mutiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances
(
op_ptrs
);
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
...
@@ -111,35 +106,32 @@ int main(int argc, char* argv[])
...
@@ -111,35 +106,32 @@ int main(int argc, char* argv[])
// profile device op instances
// profile device op instances
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
std
::
cout
<<
"Run all instances and do timing"
<<
std
::
endl
;
for
(
in
t
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
for
(
size_
t
i
=
0
;
i
<
op_ptrs
.
size
();
++
i
)
{
{
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
&
op_ptr
=
op_ptrs
[
i
];
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
a_device_buf
.
GetDeviceBuffer
(),
b0_device_buf
.
GetDeviceBuffer
(),
b0_device_buf
.
GetDeviceBuffer
(),
b1_device_buf
.
GetDeviceBuffer
(),
b1_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
{},
// p_acc0_biases
std
::
array
<
void
*
,
1
>
{
d0_device_buf
.
GetDeviceBuffer
()},
// p_acc0_biases
{},
// p_acc1_biases
{},
// p_acc1_biases
a_gs_ms_ks_lengths
,
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
,
a_gs_ms_ks_strides
,
b0_gs_ns_ks_lengths
,
b0_gs_ns_ks_lengths
,
b0_gs_ns_ks_strides
,
b0_gs_ns_ks_strides
,
b1_gs_os_ns_lengths
,
b1_gs_os_ns_lengths
,
b1_gs_os_ns_strides
,
b1_gs_os_ns_strides
,
c_gs_ms_os_lengths
,
c_gs_ms_os_lengths
,
c_gs_ms_os_strides
,
c_gs_ms_os_strides
,
{},
// acc0_biases_gs_ms_ns_lengths
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
{},
// acc0_biases_gs_ms_ns_strides
d0_gs_ms_ns_lengths
},
// acc0_biases_gs_ms_ns_lengths
{},
// acc1_biases_gs_ms_os_lengths
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
{},
// acc1_biases_gs_ms_os_strides
d0_gs_ms_ns_strides
},
// acc0_biases_gs_ms_ns_strides
AElementOp
{},
{},
// acc1_biases_gs_ms_os_lengths
B0ElementOp
{},
{},
// acc1_biases_gs_ms_os_strides
Acc0ElementOp
{
1
/
sqrtf
(
K
)},
AElementOp
{},
B1ElementOp
{},
B0ElementOp
{},
CElementOp
{});
Acc0ElementOp
{
1
/
sqrtf
(
K
),
0.1
},
B1ElementOp
{},
CElementOp
{});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
std
::
string
op_name
=
op_ptr
->
GetTypeString
();
...
@@ -151,8 +143,7 @@ int main(int argc, char* argv[])
...
@@ -151,8 +143,7 @@ int main(int argc, char* argv[])
std
::
size_t
flop
=
(
size_t
(
M
)
*
N
*
K
*
2
+
size_t
(
M
)
*
N
*
O
*
2
)
*
G0
*
G1
;
std
::
size_t
flop
=
(
size_t
(
M
)
*
N
*
K
*
2
+
size_t
(
M
)
*
N
*
O
*
2
)
*
G0
*
G1
;
std
::
size_t
num_btype
=
(
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
B0DataType
)
*
K
*
N
+
std
::
size_t
num_btype
=
(
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
B0DataType
)
*
K
*
N
+
sizeof
(
B1DataType
)
*
N
*
O
+
sizeof
(
CDataType
)
*
M
*
O
+
sizeof
(
B1DataType
)
*
N
*
O
+
sizeof
(
CDataType
)
*
M
*
O
)
*
sizeof
(
D0DataType
)
*
M
*
N
)
*
G0
*
G1
;
G0
*
G1
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
@@ -185,32 +176,29 @@ int main(int argc, char* argv[])
...
@@ -185,32 +176,29 @@ int main(int argc, char* argv[])
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
auto
&
op_ptr
=
op_ptrs
[
best_op_id
];
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
std
::
cout
<<
"Run the best instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
<<
std
::
endl
;
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
a_device_buf
.
GetDeviceBuffer
(),
b0_device_buf
.
GetDeviceBuffer
(),
b0_device_buf
.
GetDeviceBuffer
(),
b1_device_buf
.
GetDeviceBuffer
(),
b1_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
c_device_buf
.
GetDeviceBuffer
(),
{},
// p_acc0_biases
std
::
array
<
void
*
,
1
>
{
d0_device_buf
.
GetDeviceBuffer
()},
// p_acc0_biases
{},
// p_acc1_biases
{},
// p_acc1_biases
a_gs_ms_ks_lengths
,
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
,
a_gs_ms_ks_strides
,
b0_gs_ns_ks_lengths
,
b0_gs_ns_ks_lengths
,
b0_gs_ns_ks_strides
,
b0_gs_ns_ks_strides
,
b1_gs_os_ns_lengths
,
b1_gs_os_ns_lengths
,
b1_gs_os_ns_strides
,
b1_gs_os_ns_strides
,
c_gs_ms_os_lengths
,
c_gs_ms_os_lengths
,
c_gs_ms_os_strides
,
c_gs_ms_os_strides
,
{},
// acc0_biases_gs_ms_ns_lengths
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
{},
// acc0_biases_gs_ms_ns_strides
d0_gs_ms_ns_lengths
},
// acc0_biases_gs_ms_ns_lengths
{},
// acc1_biases_gs_ms_os_lengths
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
{},
// acc1_biases_gs_ms_os_strides
d0_gs_ms_ns_strides
},
// acc0_biases_gs_ms_ns_strides
AElementOp
{},
{},
// acc1_biases_gs_ms_os_lengths
B0ElementOp
{},
{},
// acc1_biases_gs_ms_os_strides
Acc0ElementOp
{
1
/
sqrtf
(
K
)},
AElementOp
{},
B1ElementOp
{},
B0ElementOp
{},
CElementOp
{});
Acc0ElementOp
{
1
/
sqrtf
(
K
),
0.1
},
B1ElementOp
{},
CElementOp
{});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp
0 → 100644
View file @
77404e83
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ScaleMask
=
ck
::
tensor_operation
::
element_wise
::
ScaleMask
;
using
ScaleBiasMask
=
ck
::
tensor_operation
::
element_wise
::
ScaleBiasMask
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
GemmPadded
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKOPadding
;
static
constexpr
auto
TensorDefault
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
// c[g, m, n] = a[g, m, k] * b[g, n, k]
template
<
index_t
NumDimG
,
index_t
NumDimM
,
index_t
NumDimN
,
index_t
NumDimK
,
index_t
NumDimO
,
typename
DataType
,
typename
AccDataType
,
typename
D0DataTypes
,
typename
AD0ElementwiseOp
,
MaskingSpecialization
MaskingSpec
>
using
device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances
=
std
::
tuple
<
// clang-format off
// #############################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData| Acc0BiasData| Acc1BiasData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| MaskingSpec|
// #############################################| | | | | | Type| Type| Type| Type| ype| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| |
// #############################################| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| |
// #############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | |
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
256
,
128
,
32
,
64
,
32
,
8
,
8
,
2
,
32
,
32
,
2
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
256
,
128
,
32
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
2
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
#if CK_WORKAROUND_SWDEV_388832
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
256
,
32
,
64
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
8
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
#endif
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
256
,
32
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
8
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
128
,
64
,
64
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
4
,
2
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
128
,
32
,
64
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
128
,
64
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
4
,
4
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
128
,
32
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
64
,
256
,
32
,
128
,
32
,
8
,
8
,
2
,
16
,
16
,
1
,
16
,
8
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
8
,
S
<
1
,
16
,
1
,
16
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
64
,
256
,
32
,
64
,
32
,
8
,
8
,
2
,
16
,
16
,
1
,
16
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
4
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
64
,
256
,
64
,
128
,
32
,
8
,
8
,
2
,
16
,
16
,
1
,
16
,
8
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
8
,
S
<
1
,
16
,
1
,
16
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
64
,
256
,
64
,
64
,
32
,
8
,
8
,
2
,
16
,
16
,
1
,
16
,
4
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
4
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
// Padded fallback kernel
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmPadded
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
128
,
64
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
4
,
4
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
false
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
,
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
DataType
,
DataType
,
DataType
,
DataType
,
D0DataTypes
,
ck
::
Tuple
<>
,
AccDataType
,
DataType
,
PassThrough
,
PassThrough
,
AD0ElementwiseOp
,
PassThrough
,
PassThrough
,
GemmPadded
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
TensorDefault
,
1
,
256
,
128
,
64
,
32
,
128
,
32
,
8
,
8
,
2
,
32
,
32
,
1
,
2
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
8
,
32
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
4
,
2
,
false
,
1
,
2
,
S
<
1
,
32
,
1
,
8
>
,
8
,
MaskingSpec
>
// clang-format on
>
;
template
<
index_t
NumDimG
,
index_t
NumDimM
,
index_t
NumDimN
,
index_t
NumDimK
,
index_t
NumDimO
,
typename
ADataType
,
typename
B0DataType
,
typename
B1DataType
,
typename
CDataType
,
typename
Acc0BiasDataType
,
typename
Acc1BiasDataType
,
typename
AElementwiseOperation
,
typename
B0ElementwiseOperation
,
typename
C0DEElementwiseOperation
,
typename
B1ElementwiseOperation
,
typename
C1DEElementwiseOperation
,
MaskingSpecialization
MaskingSpec
>
void
add_device_batched_gemm_mutiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceBatchedGemmSoftmaxGemmPermute
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
ADataType
,
B0DataType
,
B1DataType
,
CDataType
,
Acc0BiasDataType
,
Acc1BiasDataType
,
AElementwiseOperation
,
B0ElementwiseOperation
,
C0DEElementwiseOperation
,
B1ElementwiseOperation
,
C1DEElementwiseOperation
,
MaskingSpec
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
ADataType
,
F32
,
Acc0BiasDataType
,
C0DEElementwiseOperation
,
MaskingSpec
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.cpp
View file @
77404e83
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment