Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
ea38a958
Commit
ea38a958
authored
Apr 22, 2024
by
root
Browse files
add
parent
162d0305
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
594 additions
and
17 deletions
+594
-17
client_example/30_gemm_bf16Aint8B_add_fastgelu/CMakeLists.txt
...nt_example/30_gemm_bf16Aint8B_add_fastgelu/CMakeLists.txt
+3
-0
client_example/30_gemm_bf16Aint8B_add_fastgelu/gemm_multiply_add_fastgelu_xdl_bf16_i8.cpp
...B_add_fastgelu/gemm_multiply_add_fastgelu_xdl_bf16_i8.cpp
+220
-0
client_example/30_gemm_bf16Aint8B_add_fastgelu/gemm_multiply_xdl_bf16_i8.cpp
...emm_bf16Aint8B_add_fastgelu/gemm_multiply_xdl_bf16_i8.cpp
+214
-0
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
...pu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+6
-2
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
..._operation_instance/device_operation_instance_factory.hpp
+2
-0
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply.hpp
...k/library/tensor_operation_instance/gpu/gemm_multiply.hpp
+88
-0
library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
...nsor_operation_instance/gpu/gemm_universal/CMakeLists.txt
+1
-0
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn.hpp
...vice_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn.hpp
+14
-14
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_add_fastgelu_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
...dd_fastgelu_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
+3
-1
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
...al_multiply_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
+43
-0
No files found.
client_example/30_gemm_bf16Aint8B_add_fastgelu/CMakeLists.txt
View file @
ea38a958
...
...
@@ -5,6 +5,9 @@ if(GPU_TARGETS MATCHES "gfx9" AND ((DTYPES MATCHES "int8" AND DTYPES MATCHES "bf
add_executable
(
client_gemm_multiply_add_fastgelu_xdl_bf16_i8 gemm_multiply_add_fastgelu_xdl_bf16_i8.cpp
)
target_link_libraries
(
client_gemm_multiply_add_fastgelu_xdl_bf16_i8 PRIVATE composable_kernel::device_gemm_operations
)
add_executable
(
client_gemm_multiply_xdl_bf16_i8 gemm_multiply_xdl_bf16_i8.cpp
)
target_link_libraries
(
client_gemm_multiply_xdl_bf16_i8 PRIVATE composable_kernel::device_gemm_operations
)
add_executable
(
client_gemm_bias_bf16_i8_bf16 gemm_bias_xdl_bf16_i8.cpp
)
target_link_libraries
(
client_gemm_bias_bf16_i8_bf16 PRIVATE composable_kernel::device_gemm_operations
)
...
...
client_example/30_gemm_bf16Aint8B_add_fastgelu/gemm_multiply_add_fastgelu_xdl_bf16_i8.cpp
0 → 100644
View file @
ea38a958
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iomanip>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_multiply_add_fastgelu.hpp"
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

// Scalar type shorthands.
using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

// Matrix layout tags.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

// Element types. A0/B0 are the GEMM inputs; B1 and D0 are the two extra
// tensors handed to the device op through DsDataType; E is the output.
using A0DataType       = BF16;
using B0DataType       = I8;
using B1DataType       = BF16;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using D0DataType       = BF16;
using DsDataType       = ck::Tuple<B1DataType, D0DataType>;
using EDataType        = BF16;

// Layouts, in the same order as the element types above. Everything is
// row-major in this example.
using A0Layout = Row;
using B0Layout = Row;
using B1Layout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<B1Layout, D0Layout>;
using ELayout  = Row;

// Element-wise functors: A and B pass through unchanged, the epilogue is
// MultiplyAddFastGelu.
using PassThrough         = ck::tensor_operation::element_wise::PassThrough;
using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = MultiplyAddFastGelu;

// NOTE(review): GemmSpec is not referenced by the code visible in this file.
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
/// Minimal owner of one HIP device allocation.
///
/// The constructor requests `mem_size` bytes with hipMalloc and the destructor
/// hands the pointer back to hipFree. Both HIP status codes are deliberately
/// discarded, exactly as before; `p_mem_` starts out as nullptr.
///
/// The type is non-copyable: the destructor frees `p_mem_` unconditionally, so
/// a copy would release the same device pointer twice.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates `mem_size` bytes of device memory.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // Copying would lead to a double hipFree of the same pointer.
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer (ownership stays with this object).
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
// clang-format on
/// Client example: times every registered bf16 x int8 GEMM instance whose
/// epilogue is MultiplyAddFastGelu and reports the fastest one.
///
/// Command line: either no arguments (default problem size) or exactly seven:
/// M, N, K, StrideA, StrideB, StrideD, StrideE.
///
/// Returns EXIT_FAILURE on a malformed command line and 0 otherwise.
int main(int argc, char* argv[])
{
    // GEMM shape
    ck::index_t M = 4096;
    ck::index_t N = 768;
    ck::index_t K = 6144;

    ck::index_t StrideA = K;
    ck::index_t StrideB = N;
    ck::index_t StrideD = N;
    ck::index_t StrideE = N;

    if(argc == 8)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);
        K = std::stoi(argv[3]);

        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
        StrideD = std::stoi(argv[6]);
        StrideE = std::stoi(argv[7]);
    }
    else if(argc != 1)
    {
        // A wrong argument count is an error, so do not report success.
        std::cout << "arg1 to 7: M, N, K, StrideA, StrideB, StrideD, StrideE\n";
        return EXIT_FAILURE;
    }
    // argc == 1: keep the default problem size.

    // Number of elements spanned by an nRow x nCol matrix with the given
    // leading-dimension stride, for row- or column-major storage.
    auto f_matrix_space_size =
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);

            if constexpr(std::is_same<Layout, Row>::value)
            {
                return (nRow - 1) * stride + nCol;
            }
            else
            {
                return (nCol - 1) * stride + nRow;
            }
        };

    SimpleDeviceMem a0_device_buf(sizeof(A0DataType) *
                                  f_matrix_space_size(M, K, StrideA, A0Layout{}));
    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) *
                                  f_matrix_space_size(K, N, StrideB, B0Layout{}));
    // B1 is passed to the device op with stride 0 (see the Ds stride array
    // below), so the same stride is used to size its buffer.
    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * f_matrix_space_size(K, N, 0, B1Layout{}));
    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
                                  f_matrix_space_size(M, N, StrideD, D0Layout{}));
    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
                                 f_matrix_space_size(M, N, StrideE, ELayout{}));

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    constexpr ck::index_t NumDTensor = 2;

    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<A0Layout,
                                                                       B0Layout,
                                                                       DsLayout,
                                                                       ELayout,
                                                                       A0DataType,
                                                                       B0DataType,
                                                                       DsDataType,
                                                                       EDataType,
                                                                       AElementOp,
                                                                       BElementOp,
                                                                       CDEElementOp>;

    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    bool found            = false;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(const auto& op_ptr : op_ptrs)
    {
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            a0_device_buf.GetDeviceBuffer(),
            b0_device_buf.GetDeviceBuffer(),
            std::array<const void*, NumDTensor>{b1_device_buf.GetDeviceBuffer(),
                                                d0_device_buf.GetDeviceBuffer()},
            e_device_buf.GetDeviceBuffer(),
            M,
            N,
            K,
            StrideA,
            StrideB,
            std::array<ck::index_t, NumDTensor>{0, StrideD},
            StrideE,
            a_element_op,
            b_element_op,
            cde_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

        std::size_t flop = std::size_t(2) * M * N * K;

        // NOTE(review): only A, B and E traffic is counted here; the D
        // tensors are not part of the reported bandwidth.
        std::size_t num_btype =
            sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;

        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

        float gb_per_sec = num_btype / 1.E6 / ave_time;

        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            found           = true;
            best_op_name    = op_name;
            best_tflops     = tflops;
            best_ave_time   = ave_time;
            best_gb_per_sec = gb_per_sec;
        }
    }

    if(!found)
    {
        // Avoid printing a meaningless all-zero "Best Perf" line.
        std::cout << "no supported instance found for this problem" << std::endl;
        return 0;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    return 0;
}
client_example/30_gemm_bf16Aint8B_add_fastgelu/gemm_multiply_xdl_bf16_i8.cpp
0 → 100644
View file @
ea38a958
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iomanip>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_multiply.hpp"
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

// Scalar type shorthands.
using BF16 = ck::bhalf_t;
using I8   = int8_t;
using F32  = float;

// Matrix layout tags.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

// Element types. A0/B0 are the GEMM inputs; B1 is the single extra tensor
// handed to the device op through DsDataType; E is the output.
using A0DataType       = BF16;
using B0DataType       = I8;
using B1DataType       = BF16;
using AccDataType      = F32;
using CShuffleDataType = BF16;
using DsDataType       = ck::Tuple<B1DataType>;
using EDataType        = BF16;

// Layouts; everything is row-major in this example.
using A0Layout = Row;
using B0Layout = Row;
using B1Layout = Row;
// NOTE(review): D0Layout is not referenced by the code visible in this file
// (this example has no D0 tensor).
using D0Layout = Row;
using DsLayout = ck::Tuple<B1Layout>;
using ELayout  = Row;

// Element-wise functors: A and B pass through unchanged, the epilogue is
// Multiply.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Multiply    = ck::tensor_operation::element_wise::Multiply;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = Multiply;

// NOTE(review): GemmSpec is not referenced by the code visible in this file.
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
/// Minimal owner of one HIP device allocation.
///
/// The constructor requests `mem_size` bytes with hipMalloc and the destructor
/// hands the pointer back to hipFree. Both HIP status codes are deliberately
/// discarded, exactly as before; `p_mem_` starts out as nullptr.
///
/// The type is non-copyable: the destructor frees `p_mem_` unconditionally, so
/// a copy would release the same device pointer twice.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    /// Allocates `mem_size` bytes of device memory.
    explicit SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    // Copying would lead to a double hipFree of the same pointer.
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    /// Returns the raw device pointer (ownership stays with this object).
    void* GetDeviceBuffer() const { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
// clang-format on
/// Client example: times every registered bf16 x int8 GEMM instance whose
/// epilogue is Multiply and reports the fastest one.
///
/// Command line: either no arguments (default problem size) or exactly six:
/// M, N, K, StrideA, StrideB, StrideE.
///
/// Returns EXIT_FAILURE on a malformed command line and 0 otherwise.
int main(int argc, char* argv[])
{
    // GEMM shape
    ck::index_t M = 4096;
    ck::index_t N = 768;
    ck::index_t K = 6144;

    ck::index_t StrideA = K;
    ck::index_t StrideB = N;
    ck::index_t StrideE = N;

    if(argc == 7)
    {
        M = std::stoi(argv[1]);
        N = std::stoi(argv[2]);
        K = std::stoi(argv[3]);

        StrideA = std::stoi(argv[4]);
        StrideB = std::stoi(argv[5]);
        StrideE = std::stoi(argv[6]);
    }
    else if(argc != 1)
    {
        // Six arguments are accepted (the old text said "arg1 to 7"), and a
        // wrong argument count is an error, so do not report success.
        std::cout << "arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n";
        return EXIT_FAILURE;
    }
    // argc == 1: keep the default problem size.

    // Number of elements spanned by an nRow x nCol matrix with the given
    // leading-dimension stride, for row- or column-major storage.
    auto f_matrix_space_size =
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);

            if constexpr(std::is_same<Layout, Row>::value)
            {
                return (nRow - 1) * stride + nCol;
            }
            else
            {
                return (nCol - 1) * stride + nRow;
            }
        };

    SimpleDeviceMem a0_device_buf(sizeof(A0DataType) *
                                  f_matrix_space_size(M, K, StrideA, A0Layout{}));
    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) *
                                  f_matrix_space_size(K, N, StrideB, B0Layout{}));
    // B1 is passed to the device op with stride 0 (see the Ds stride array
    // below), so the same stride is used to size its buffer.
    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * f_matrix_space_size(K, N, 0, B1Layout{}));
    SimpleDeviceMem e_device_buf(sizeof(EDataType) *
                                 f_matrix_space_size(M, N, StrideE, ELayout{}));

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

    constexpr ck::index_t NumDTensor = 1;

    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<A0Layout,
                                                                       B0Layout,
                                                                       DsLayout,
                                                                       ELayout,
                                                                       A0DataType,
                                                                       B0DataType,
                                                                       DsDataType,
                                                                       EDataType,
                                                                       AElementOp,
                                                                       BElementOp,
                                                                       CDEElementOp>;

    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    bool found            = false;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(const auto& op_ptr : op_ptrs)
    {
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            a0_device_buf.GetDeviceBuffer(),
            b0_device_buf.GetDeviceBuffer(),
            std::array<const void*, NumDTensor>{b1_device_buf.GetDeviceBuffer()},
            e_device_buf.GetDeviceBuffer(),
            M,
            N,
            K,
            StrideA,
            StrideB,
            std::array<ck::index_t, NumDTensor>{0},
            StrideE,
            a_element_op,
            b_element_op,
            cde_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(!op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << op_name << " does not support this problem" << std::endl;
            continue;
        }

        float ave_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, 0, 20, 50});

        std::size_t flop = std::size_t(2) * M * N * K;

        // NOTE(review): only A, B and E traffic is counted here; the D
        // tensor is not part of the reported bandwidth.
        std::size_t num_btype =
            sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;

        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

        float gb_per_sec = num_btype / 1.E6 / ave_time;

        std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                  << gb_per_sec << " GB/s, " << op_name << std::endl;

        if(tflops > best_tflops)
        {
            found           = true;
            best_op_name    = op_name;
            best_tflops     = tflops;
            best_ave_time   = ave_time;
            best_gb_per_sec = gb_per_sec;
        }
    }

    if(!found)
    {
        // Avoid printing a meaningless all-zero "Best Perf" line.
        std::cout << "no supported instance found for this problem" << std::endl;
        return 0;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    return 0;
}
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
View file @
ea38a958
...
...
@@ -65,7 +65,9 @@ template <typename ALayout,
BlockGemmPipelineScheduler
BlkGemmPipeSched
=
BlockGemmPipelineScheduler
::
Intrawave
,
BlockGemmPipelineVersion
BlkGemmPipelineVer
=
BlockGemmPipelineVersion
::
v1
,
typename
ComputeTypeA
=
CDataType
,
typename
ComputeTypeB
=
ComputeTypeA
>
typename
ComputeTypeB
=
ComputeTypeA
,
typename
LDSTypeA
=
ComputeTypeA
,
typename
LDSTypeB
=
ComputeTypeB
>
struct
DeviceGemmMultiD_Xdl_CShuffle_V3
:
public
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
DsLayout
,
...
...
@@ -128,7 +130,9 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleD<ALayout,
BlkGemmPipeSched
,
BlkGemmPipelineVer
,
ComputeTypeA
,
ComputeTypeB
>
;
ComputeTypeB
,
LDSTypeA
,
LDSTypeB
>
;
using
Argument
=
typename
GridwiseGemm
::
Argument
;
...
...
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
View file @
ea38a958
...
...
@@ -32,6 +32,7 @@ using F16_F16_Tuple = ck::Tuple<F16, F16>;
using
F64_Tuple
=
ck
::
Tuple
<
F64
>
;
using
F32_Tuple
=
ck
::
Tuple
<
F32
>
;
using
BF16_Tuple
=
ck
::
Tuple
<
BF16
>
;
using
I32_Tuple
=
ck
::
Tuple
<
I32
>
;
using
I32_F32_Tuple
=
ck
::
Tuple
<
I32
,
F32
>
;
using
I8_Tuple
=
ck
::
Tuple
<
I8
>
;
...
...
@@ -99,6 +100,7 @@ using Scale = ck::tensor_operation::element_wise::Scale;
using
Bilinear
=
ck
::
tensor_operation
::
element_wise
::
Bilinear
;
using
AddAddFastGelu
=
ck
::
tensor_operation
::
element_wise
::
AddAddFastGelu
;
using
MultiplyAddFastGelu
=
ck
::
tensor_operation
::
element_wise
::
MultiplyAddFastGelu
;
using
Multiply
=
ck
::
tensor_operation
::
element_wise
::
Multiply
;
using
AddFastGelu
=
ck
::
tensor_operation
::
element_wise
::
AddFastGelu
;
using
AddRelu
=
ck
::
tensor_operation
::
element_wise
::
AddRelu
;
using
AddSilu
=
ck
::
tensor_operation
::
element_wise
::
AddSilu
;
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply.hpp
0 → 100644
View file @
ea38a958
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
// Appends device GEMM instances to the given vector. The instances implement
// DeviceGemmMultipleD with row-major A, B, D0 and E, bf16 A, int8 B, a single
// bf16 D tensor, bf16 E, PassThrough on A and B, and Multiply as the CDE
// element-wise operation. Only declared here; the definition lives in the
// instance library.
void add_device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_multiply_mnkpadding_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
                                                    Row,
                                                    Row_Tuple,
                                                    Row,
                                                    BF16,
                                                    I8,
                                                    BF16_Tuple,
                                                    BF16,
                                                    PassThrough,
                                                    PassThrough,
                                                    Multiply>>>&);
// GEMM + Multiply
//
// Factory specialization for DeviceGemmMultipleD with exactly one D tensor,
// PassThrough on A and B, and Multiply as the CDE element-wise operation.
// GetInstances() returns the registered instances for the requested
// layout / data-type combination, or an empty vector when none is registered.
template <typename ALayout,
          typename BLayout,
          typename D0Layout,
          typename ELayout,
          typename ADataType,
          typename BDataType,
          typename D0DataType,
          typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
                                                      BLayout,
                                                      ck::Tuple<D0Layout>,
                                                      ELayout,
                                                      ADataType,
                                                      BDataType,
                                                      ck::Tuple<D0DataType>,
                                                      EDataType,
                                                      ck::tensor_operation::element_wise::PassThrough,
                                                      ck::tensor_operation::element_wise::PassThrough,
                                                      ck::tensor_operation::element_wise::Multiply>>
{
    using DeviceOp = DeviceGemmMultipleD<ALayout,
                                         BLayout,
                                         ck::Tuple<D0Layout>,
                                         ELayout,
                                         ADataType,
                                         BDataType,
                                         ck::Tuple<D0DataType>,
                                         EDataType,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::Multiply>;

    static auto GetInstances()
    {
        // bf16 A, int8 B, bf16 D0, bf16 E.
        constexpr bool data_types_match =
            is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, int8_t> &&
            is_same_v<D0DataType, bhalf_t> && is_same_v<EDataType, bhalf_t>;

        // All four tensors row-major.
        constexpr bool layouts_match = is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                                       is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>;

        std::vector<std::unique_ptr<DeviceOp>> instances;

        if constexpr(data_types_match && layouts_match)
        {
            add_device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_multiply_mnkpadding_instances(
                instances);
        }

        return instances;
    }
};
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
View file @
ea38a958
...
...
@@ -66,6 +66,7 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_add_fastgelu_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
)
...
...
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn.hpp
View file @
ea38a958
...
...
@@ -26,6 +26,7 @@ using S = Sequence<Is...>;
using
PassThrough
=
element_wise
::
PassThrough
;
using
MultiplyAddFastGelu
=
element_wise
::
MultiplyAddFastGelu
;
using
Multiply
=
element_wise
::
Multiply
;
static
constexpr
auto
GemmDefault
=
GemmSpecialization
::
Default
;
static
constexpr
auto
GemmKPadding
=
GemmSpecialization
::
KPadding
;
...
...
@@ -35,9 +36,7 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
static
constexpr
auto
Intrawave
=
BlockGemmPipelineScheduler
::
Intrawave
;
static
constexpr
auto
Interwave
=
BlockGemmPipelineScheduler
::
Interwave
;
using
DsLayout
=
ck
::
Tuple
<
Row
,
Row
>
;
template
<
typename
DsDType
,
typename
CElementwiseOp
,
GemmSpecialization
GemmSpec
>
template
<
typename
DsLayout
,
typename
DsDType
,
typename
CElementwiseOp
,
GemmSpecialization
GemmSpec
>
using
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_comp_instances
=
std
::
tuple
<
// clang-format off
//#########################| ALayout| BLayout| CLayout|AData| BData| DsData| CData| AccData| Cshuffle| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1|MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Block-wiseGemm| Block-wiseGemm|
...
...
@@ -56,7 +55,8 @@ using device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_comp_instances = s
// clang-format on
>
;
template
<
typename
DsDType
,
template
<
typename
DsLayout
,
typename
DsDType
,
typename
CElementwiseOp
,
GemmSpecialization
GemmSpec
,
BlockGemmPipelineScheduler
BlkGemmPipeSched
>
...
...
@@ -68,17 +68,17 @@ using device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_mem_instances = st
//#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
// Latency friendly
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
64
,
16
,
16
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
2
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
1
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v1
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
32
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
4
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v1
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
64
,
16
,
16
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
2
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
1
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v1
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
32
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
4
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v1
,
BF16
,
BF16
,
BF16
,
I8
>
,
// Memory friendly
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
64
,
16
,
16
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
2
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
1
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
32
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
4
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
64
,
128
,
8
,
4
,
16
,
16
,
1
,
2
,
S
<
16
,
8
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
32
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
32
,
64
,
128
,
8
,
4
,
32
,
32
,
1
,
1
,
S
<
16
,
8
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
32
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
128
,
64
,
8
,
4
,
16
,
16
,
1
,
4
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
32
,
128
,
64
,
8
,
4
,
32
,
32
,
1
,
2
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
256
,
16
,
256
,
64
,
8
,
4
,
16
,
16
,
1
,
4
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
256
,
32
,
256
,
64
,
8
,
4
,
32
,
32
,
1
,
2
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
>
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
64
,
16
,
16
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
2
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
1
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
4
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
32
,
256
,
8
,
4
,
16
,
16
,
1
,
1
,
S
<
32
,
4
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
64
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
64
,
128
,
8
,
4
,
16
,
16
,
1
,
2
,
S
<
16
,
8
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
32
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
32
,
64
,
128
,
8
,
4
,
32
,
32
,
1
,
1
,
S
<
16
,
8
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
32
,
4
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
16
,
128
,
64
,
8
,
4
,
16
,
16
,
1
,
4
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
128
,
32
,
128
,
64
,
8
,
4
,
32
,
32
,
1
,
2
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
8
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
8
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
256
,
16
,
256
,
64
,
8
,
4
,
16
,
16
,
1
,
4
,
S
<
8
,
16
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
4
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
,
DeviceGemmMultiD_Xdl_CShuffle_V3
<
Row
,
Row
,
DsLayout
,
Row
,
BF16
,
I8
,
DsDType
,
BF16
,
F32
,
F32
,
PassThrough
,
PassThrough
,
CElementwiseOp
,
GemmSpec
,
256
,
32
,
256
,
64
,
8
,
4
,
32
,
32
,
1
,
2
,
S
<
8
,
32
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
0
,
S
<
16
,
16
,
1
>
,
S
<
0
,
2
,
1
>
,
S
<
0
,
2
,
1
>
,
1
,
16
,
4
,
0
,
1
,
1
,
S
<
1
,
16
,
1
,
16
>
,
8
,
BlkGemmPipeSched
,
BlockGemmPipelineVersion
::
v2
,
BF16
,
BF16
,
BF16
,
I8
>
// clang-format on
>
;
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_add_fastgelu_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
View file @
ea38a958
...
...
@@ -24,13 +24,15 @@ void add_device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_multiply_add_fa
add_device_operation_instances
(
instances
,
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_comp_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAddFastGelu
,
GemmMNKPadding
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
BF16
,
BF16
>
,
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
,
Row
>
,
ck
::
Tuple
<
BF16
,
BF16
>
,
MultiplyAddFastGelu
,
GemmMNKPadding
,
Intrawave
>
{});
...
...
library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i8_bf16_multi_d/device_gemm_xdl_universal_multiply_bf16_i8_bf16_mk_kn_mn_mnkpadding_instance.cpp
0 → 100644
View file @
ea38a958
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
void
add_device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_multiply_mnkpadding_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Row
,
ck
::
Tuple
<
Row
>
,
Row
,
BF16
,
I8
,
ck
::
Tuple
<
BF16
>
,
BF16
,
PassThrough
,
PassThrough
,
Multiply
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_comp_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmMNKPadding
>
{});
add_device_operation_instances
(
instances
,
device_gemm_xdl_universal_multi_d_bf16_i8_bf16_mk_kn_mn_mem_instances
<
ck
::
Tuple
<
Row
>
,
ck
::
Tuple
<
BF16
>
,
Multiply
,
GemmMNKPadding
,
Intrawave
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment