Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
91c33aec
Commit
91c33aec
authored
Jun 01, 2023
by
Bartlomiej Kocot
Committed by
Bartłomiej Kocot
Jun 09, 2023
Browse files
Add DeviceBatchedGemmMultipleD_Dl
parent
016ebaa7
Changes
32
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
906 additions
and
130 deletions
+906
-130
profiler/include/profiler/profile_batched_gemm_impl.hpp
profiler/include/profiler/profile_batched_gemm_impl.hpp
+62
-32
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+2
-1
profiler/src/profile_batched_gemm.cpp
profiler/src/profile_batched_gemm.cpp
+68
-49
profiler/src/profile_batched_gemm_multi_d.cpp
profiler/src/profile_batched_gemm_multi_d.cpp
+190
-0
test/CMakeLists.txt
test/CMakeLists.txt
+1
-0
test/batched_gemm/batched_gemm_bf16.cpp
test/batched_gemm/batched_gemm_bf16.cpp
+82
-12
test/batched_gemm/batched_gemm_fp16.cpp
test/batched_gemm/batched_gemm_fp16.cpp
+82
-12
test/batched_gemm/batched_gemm_fp32.cpp
test/batched_gemm/batched_gemm_fp32.cpp
+82
-12
test/batched_gemm/batched_gemm_int8.cpp
test/batched_gemm/batched_gemm_int8.cpp
+82
-12
test/batched_gemm_multi_d/CMakeLists.txt
test/batched_gemm_multi_d/CMakeLists.txt
+7
-0
test/batched_gemm_multi_d/batched_gemm_multi_d_fp16.cpp
test/batched_gemm_multi_d/batched_gemm_multi_d_fp16.cpp
+124
-0
test/batched_gemm_multi_d/batched_gemm_multi_d_int8.cpp
test/batched_gemm_multi_d/batched_gemm_multi_d_int8.cpp
+124
-0
No files found.
profiler/include/profiler/profile_batched_gemm_impl.hpp
View file @
91c33aec
...
...
@@ -8,9 +8,11 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
@@ -27,7 +29,11 @@ template <typename ADataType,
typename
CDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
>
typename
CLayout
,
typename
AElementOp
,
typename
BElementOp
,
typename
CElementOp
,
typename
DeviceOp
>
bool
profile_batched_gemm_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
...
...
@@ -88,10 +94,6 @@ bool profile_batched_gemm_impl(int do_verification,
b_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
}
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
c_element_op
=
CElementOp
{};
...
...
@@ -124,16 +126,6 @@ bool profile_batched_gemm_impl(int do_verification,
b_device_buf
.
ToDevice
(
b_g_k_n
.
mData
.
data
());
c_device_buf
.
ToDevice
(
c_g_m_n_device_result
.
mData
.
data
());
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
...
...
@@ -148,23 +140,61 @@ bool profile_batched_gemm_impl(int do_verification,
// profile device op instances
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
BatchStrideA
,
BatchStrideB
,
BatchStrideC
,
BatchCount
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
std
::
unique_ptr
<
tensor_operation
::
device
::
BaseArgument
>
argument_ptr
;
if
constexpr
(
std
::
is_same
<
DeviceOp
,
ck
::
tensor_operation
::
device
::
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>>::
value
)
{
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
BatchStrideA
,
BatchStrideB
,
BatchStrideC
,
BatchCount
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
}
else
{
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
static_cast
<
ADataType
*>
(
a_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_device_buf
.
GetDeviceBuffer
()),
{},
static_cast
<
CDataType
*>
(
c_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
BatchCount
,
StrideA
,
StrideB
,
{},
StrideC
,
BatchStrideA
,
BatchStrideB
,
{},
BatchStrideC
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{},
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
}
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
...
profiler/src/CMakeLists.txt
View file @
91c33aec
...
...
@@ -34,6 +34,7 @@ set(PROFILER_SOURCES
profile_grouped_gemm_fastgelu.cpp
profile_contraction_bilinear.cpp
profile_contraction_scale.cpp
profile_batched_gemm_multi_d.cpp
)
set
(
PROFILER_EXECUTABLE ckProfiler
)
...
...
@@ -77,5 +78,5 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
rocm_install
(
TARGETS
${
PROFILER_EXECUTABLE
}
COMPONENT profiler
)
profiler/src/profile_batched_gemm.cpp
View file @
91c33aec
...
...
@@ -10,6 +10,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
...
...
@@ -78,55 +80,72 @@ int profile_batched_gemm(int argc, char* argv[])
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
CDataType
=
decltype
(
c_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
CLayout
=
decltype
(
c_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
const
int
StrideA_
=
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
;
const
int
StrideB_
=
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
;
const
int
StrideC_
=
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
;
const
int
DefaultBatchStrideA
=
(
ck
::
is_same_v
<
ALayout
,
Row
>
?
M
:
K
)
*
StrideA_
;
const
int
DefaultBatchStrideB
=
(
ck
::
is_same_v
<
BLayout
,
Row
>
?
K
:
N
)
*
StrideB_
;
const
int
DefaultBatchStrideC
=
(
ck
::
is_same_v
<
CLayout
,
Row
>
?
M
:
N
)
*
StrideC_
;
const
int
BatchStrideA_
=
(
BatchStrideA
<
0
)
?
DefaultBatchStrideA
:
BatchStrideA
;
const
int
BatchStrideB_
=
(
BatchStrideB
<
0
)
?
DefaultBatchStrideB
:
BatchStrideB
;
const
int
BatchStrideC_
=
(
BatchStrideC
<
0
)
?
DefaultBatchStrideC
:
BatchStrideC
;
bool
pass
=
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
BatchStrideA_
,
BatchStrideB_
,
BatchStrideC_
,
StrideA_
,
StrideB_
,
StrideC_
,
BatchCount
);
return
pass
?
0
:
1
;
};
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
CDataType
=
decltype
(
c_type
);
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
CLayout
=
decltype
(
c_layout
);
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
const
int
StrideA_
=
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
;
const
int
StrideB_
=
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
;
const
int
StrideC_
=
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
;
const
int
DefaultBatchStrideA
=
(
ck
::
is_same_v
<
ALayout
,
Row
>
?
M
:
K
)
*
StrideA_
;
const
int
DefaultBatchStrideB
=
(
ck
::
is_same_v
<
BLayout
,
Row
>
?
K
:
N
)
*
StrideB_
;
const
int
DefaultBatchStrideC
=
(
ck
::
is_same_v
<
CLayout
,
Row
>
?
M
:
N
)
*
StrideC_
;
const
int
BatchStrideA_
=
(
BatchStrideA
<
0
)
?
DefaultBatchStrideA
:
BatchStrideA
;
const
int
BatchStrideB_
=
(
BatchStrideB
<
0
)
?
DefaultBatchStrideB
:
BatchStrideB
;
const
int
BatchStrideC_
=
(
BatchStrideC
<
0
)
?
DefaultBatchStrideC
:
BatchStrideC
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
bool
pass
=
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
,
AElementOp
,
BElementOp
,
CElementOp
,
DeviceOp
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
BatchStrideA_
,
BatchStrideB_
,
BatchStrideC_
,
StrideA_
,
StrideB_
,
StrideC_
,
BatchCount
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
...
...
profiler/src/profile_batched_gemm_multi_d.cpp
0 → 100644
View file @
91c33aec
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "profiler_operation_registry.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
enum
struct
GemmMatrixLayout
{
MK_KN_MN
,
// 0
MK_NK_MN
,
// 1
KM_KN_MN
,
// 2
KM_NK_MN
,
// 3
};
enum
struct
GemmDataType
{
F16_F16_F16
,
// 0
INT8_INT8_INT8
,
// 1
};
#define OP_NAME "batched_gemm_multi_d"
#define OP_DESC "Batched GEMM multi D"
int
profile_batched_gemm_multi_d
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
18
)
{
// clang-format off
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: fp16; 1: int8)
\n
"
);
printf
(
"arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];
\n
"
);
printf
(
" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];
\n
"
);
printf
(
" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];
\n
"
);
printf
(
" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
printf
(
"arg5: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
);
printf
(
"arg6: print tensor value (0: no; 1: yes)
\n
"
);
printf
(
"arg7: time kernel (0=n0, 1=yes)
\n
"
);
printf
(
"arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount
\n
"
);
// clang-format on
exit
(
1
);
}
const
auto
data_type
=
static_cast
<
GemmDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
auto
layout
=
static_cast
<
GemmMatrixLayout
>
(
std
::
stoi
(
argv
[
3
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
4
]);
const
int
init_method
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
6
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
7
]);
const
int
M
=
std
::
stoi
(
argv
[
8
]);
const
int
N
=
std
::
stoi
(
argv
[
9
]);
const
int
K
=
std
::
stoi
(
argv
[
10
]);
const
int
StrideA
=
std
::
stoi
(
argv
[
11
]);
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
const
int
BatchStrideA
=
std
::
stoi
(
argv
[
14
]);
const
int
BatchStrideB
=
std
::
stoi
(
argv
[
15
]);
const
int
BatchStrideC
=
std
::
stoi
(
argv
[
16
]);
const
int
BatchCount
=
std
::
stoi
(
argv
[
17
]);
using
F16
=
ck
::
half_t
;
using
INT8
=
int8_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
auto
profile
=
[
&
](
auto
a_type
,
auto
b_type
,
auto
c_type
,
auto
a_layout
,
auto
b_layout
,
auto
c_layout
)
{
using
ADataType
=
decltype
(
a_type
);
using
BDataType
=
decltype
(
b_type
);
using
CDataType
=
decltype
(
c_type
);
using
DsDataType
=
ck
::
Tuple
<>
;
using
ALayout
=
decltype
(
a_layout
);
using
BLayout
=
decltype
(
b_layout
);
using
CLayout
=
decltype
(
c_layout
);
using
DsLayout
=
ck
::
Tuple
<>
;
const
int
DefaultStrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
DefaultStrideB
=
ck
::
is_same_v
<
BLayout
,
Row
>
?
N
:
K
;
const
int
DefaultStrideC
=
ck
::
is_same_v
<
CLayout
,
Row
>
?
N
:
M
;
const
int
StrideA_
=
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
;
const
int
StrideB_
=
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
;
const
int
StrideC_
=
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
;
const
int
DefaultBatchStrideA
=
(
ck
::
is_same_v
<
ALayout
,
Row
>
?
M
:
K
)
*
StrideA_
;
const
int
DefaultBatchStrideB
=
(
ck
::
is_same_v
<
BLayout
,
Row
>
?
K
:
N
)
*
StrideB_
;
const
int
DefaultBatchStrideC
=
(
ck
::
is_same_v
<
CLayout
,
Row
>
?
M
:
N
)
*
StrideC_
;
const
int
BatchStrideA_
=
(
BatchStrideA
<
0
)
?
DefaultBatchStrideA
:
BatchStrideA
;
const
int
BatchStrideB_
=
(
BatchStrideB
<
0
)
?
DefaultBatchStrideB
:
BatchStrideB
;
const
int
BatchStrideC_
=
(
BatchStrideC
<
0
)
?
DefaultBatchStrideC
:
BatchStrideC
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemmMultiD
<
ALayout
,
BLayout
,
DsLayout
,
CLayout
,
ADataType
,
BDataType
,
DsDataType
,
CDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
bool
pass
=
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
ALayout
,
BLayout
,
CLayout
,
AElementOp
,
BElementOp
,
CElementOp
,
DeviceOp
>
(
do_verification
,
init_method
,
do_log
,
time_kernel
,
M
,
N
,
K
,
BatchStrideA_
,
BatchStrideB_
,
BatchStrideC_
,
StrideA_
,
StrideB_
,
StrideC_
,
BatchCount
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
F16
{},
F16
{},
F16
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
F16
{},
F16
{},
F16
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
return
profile
(
F16
{},
F16
{},
F16
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
return
profile
(
F16
{},
F16
{},
F16
{},
Col
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
INT8
{},
INT8
{},
INT8
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
INT8
{},
INT8
{},
INT8
{},
Row
{},
Col
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
return
profile
(
INT8
{},
INT8
{},
INT8
{},
Col
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
return
profile
(
INT8
{},
INT8
{},
INT8
{},
Col
{},
Col
{},
Row
{});
}
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
}
REGISTER_PROFILER_OPERATION
(
OP_NAME
,
OP_DESC
,
profile_batched_gemm_multi_d
);
test/CMakeLists.txt
View file @
91c33aec
...
...
@@ -58,6 +58,7 @@ add_subdirectory(elementwise_normalization)
add_subdirectory
(
batchnorm
)
add_subdirectory
(
contraction
)
add_subdirectory
(
pool_fwd
)
add_subdirectory
(
batched_gemm_multi_d
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_subdirectory
(
wmma_op
)
endif
()
test/batched_gemm/batched_gemm_bf16.cpp
View file @
91c33aec
...
...
@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace
{
using
ADataType
=
ck
::
bhalf_t
;
using
BDataType
=
ck
::
bhalf_t
;
...
...
@@ -12,6 +14,8 @@ using CDataType = ck::bhalf_t;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
...
...
@@ -23,21 +27,87 @@ int main()
bool
pass
=
true
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMM bf16: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
...
...
test/batched_gemm/batched_gemm_fp16.cpp
View file @
91c33aec
...
...
@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace
{
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
...
...
@@ -12,6 +14,8 @@ using CDataType = ck::half_t;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
...
...
@@ -23,21 +27,87 @@ int main()
bool
pass
=
true
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMM fp16: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
...
...
test/batched_gemm/batched_gemm_fp32.cpp
View file @
91c33aec
...
...
@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace
{
using
ADataType
=
float
;
using
BDataType
=
float
;
...
...
@@ -12,6 +14,8 @@ using CDataType = float;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
...
...
@@ -23,21 +27,87 @@ int main()
bool
pass
=
true
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMM fp32: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
...
...
test/batched_gemm/batched_gemm_int8.cpp
View file @
91c33aec
...
...
@@ -5,6 +5,8 @@
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
namespace
{
using
ADataType
=
int8_t
;
using
BDataType
=
int8_t
;
...
...
@@ -12,6 +14,8 @@ using CDataType = int8_t;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
...
...
@@ -23,21 +27,87 @@ int main()
bool
pass
=
true
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMM int8: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
...
...
test/batched_gemm_multi_d/CMakeLists.txt
0 → 100644
View file @
91c33aec
add_test_executable
(
test_batched_gemm_multi_d_fp16 batched_gemm_multi_d_fp16.cpp
)
target_link_libraries
(
test_batched_gemm_multi_d_fp16 PRIVATE utility
)
target_link_libraries
(
test_batched_gemm_multi_d_fp16 PRIVATE device_batched_gemm_multi_d_instance
)
add_test_executable
(
test_batched_gemm_multi_d_int8 batched_gemm_multi_d_int8.cpp
)
target_link_libraries
(
test_batched_gemm_multi_d_int8 PRIVATE utility
)
target_link_libraries
(
test_batched_gemm_multi_d_int8 PRIVATE device_batched_gemm_multi_d_instance
)
test/batched_gemm_multi_d/batched_gemm_multi_d_fp16.cpp
0 → 100644
View file @
91c33aec
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
namespace
{
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
{
int
M
=
512
;
int
N
=
256
;
int
K
=
128
;
int
BatchCount
=
3
;
bool
pass
=
true
;
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMMMultiD fp16: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
}
test/batched_gemm_multi_d/batched_gemm_multi_d_int8.cpp
0 → 100644
View file @
91c33aec
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/profile_batched_gemm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp"
namespace
{
using
ADataType
=
int8_t
;
using
BDataType
=
int8_t
;
using
CDataType
=
int8_t
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
Empty_Tuple
=
ck
::
Tuple
<>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
int
main
()
{
int
M
=
256
;
int
N
=
256
;
int
K
=
128
;
int
BatchCount
=
3
;
bool
pass
=
true
;
using
namespace
ck
::
tensor_operation
::
device
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Row
,
Row
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Row
,
Col
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Col
,
Row
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
,
PassThrough
,
PassThrough
,
PassThrough
,
DeviceBatchedGemmMultiD
<
Col
,
Col
,
Empty_Tuple
,
Row
,
ADataType
,
BDataType
,
Empty_Tuple
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
M
*
K
,
K
*
N
,
M
*
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMMMultiD int8: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment