gaoqiong / composable_kernel · Commit 6ceb900b

Batched Gemm int4 example.

Authored Aug 25, 2022 by Adam Osewski
Parent: 09419996

Showing 3 changed files with 166 additions and 12 deletions:

example/24_batched_gemm/CMakeLists.txt                 +13  -0
example/24_batched_gemm/batched_gemm_xdl_int4.cpp      +99  -0   (new file)
example/24_batched_gemm/run_batched_gemm_example.inc   +54  -12
example/24_batched_gemm/CMakeLists.txt @ 6ceb900b

add_custom_target(example_batched_gemm_xdl)

add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp)
add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp)
add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp)
add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp)

add_dependencies(example_batched_gemm_xdl
                 example_batched_gemm_xdl_fp32
                 example_batched_gemm_xdl_fp16
                 example_batched_gemm_xdl_bfp16
                 example_batched_gemm_xdl_int8)

if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp)
    add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4)
endif()
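The new target is gated on USE_BITINT_EXTENSION_INT4 because ck::int4_t only exists when the toolchain's experimental bit-precise integer support is enabled. A standalone sketch of what such a 4-bit type looks like (my_int4_t is a hypothetical stand-in assuming a clang-style _BitInt extension, not CK's actual definition):

// Hypothetical stand-in for ck::int4_t, assuming clang's _BitInt extension.
#include <cstdio>

#if defined(__clang__)
using my_int4_t = _BitInt(4); // bit-precise integer, values in [-8, 7]
#else
using my_int4_t = signed char; // fallback so the sketch compiles elsewhere
#endif

int main()
{
    my_int4_t x = 7;
    // Storage is one byte on typical targets, which is the property the
    // example's static_asserts check before reinterpreting int4 tensors as int8.
    std::printf("sizeof(my_int4_t) = %zu, x = %d\n", sizeof(x), static_cast<int>(x));
    return 0;
}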
example/24_batched_gemm/batched_gemm_xdl_int4.cpp (new file, mode 100644) @ 6ceb900b
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using ADataType        = ck::int4_t;
using BDataType        = ck::int4_t;
using AccDataType      = int32_t;
using CShuffleDataType = int32_t;
using DsDataType       = ck::Tuple<>;
using EDataType        = ck::int4_t;

using KernelADataType = int8_t;
using KernelBDataType = int8_t;
using KernelEDataType = int8_t;

using ALayout  = Row;
using BLayout  = Col;
using DsLayout = ck::Tuple<>;
using ELayout  = Row;

using AElementOp   = PassThrough;
using BElementOp   = PassThrough;
using CDEElementOp = PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl
    // clang-format off
    <ALayout,          //ALayout
     BLayout,          //BLayout
     DsLayout,         //DsLayout
     ELayout,          //ELayout
     KernelADataType,  //ADataType
     KernelBDataType,  //BDataType
     AccDataType,      //AccDataType
     CShuffleDataType, //CShuffleDataType
     DsDataType,       //DsDataType
     KernelEDataType,  //EDataType
     AElementOp,       //AElementwiseOperation
     BElementOp,       //BElementwiseOperation
     CDEElementOp,     //CDEElementwiseOperation
     GemmDefault,      //GEMMSpecialization
     1,                // NumGemmKPrefetchStage
     256,              // BlockSize
     256,              // MPerBlock
     128,              // NPerBlock
     64,               // KPerBlock
     16,               // AK1
     16,               // BK1
     32,               // MPerXdl
     32,               // NPerXdl
     4,                // MXdlPerWave
     2,                // NXdlPerWave
     S<4, 64, 1>,      // ABlockTransfer ThreadCluster Lengths_K0_M_K1
     S<1, 0, 2>,       // ABlockTransfer ThreadCluster ArrangeOrder
     S<1, 0, 2>,       // ABlockTransfer SrcAccessOrder
     2,                // ABlockTransfer SrcVectorDim
     16,               // ABlockTransfer SrcScalarPerVector
     16,               // ABlockTransfer DstScalarPerVector_K1
     1,                // ABlockLdsExtraM
     S<4, 64, 1>,      // BBlockTransfer ThreadCluster Lengths_K0_N_K1
     S<1, 0, 2>,       // BBlockTransfer ThreadCluster ArrangeOrder
     S<1, 0, 2>,       // BBlockTransfer SrcAccessOrder
     2,                // BBlockTransfer SrcVectorDim
     16,               // BBlockTransfer SrcScalarPerVector
     16,               // BBlockTransfer DstScalarPerVector_K1
     1,                // BBlockLdsExtraN
     1,                // CShuffleMXdlPerWavePerShuffle
     1,                // CShuffleNXdlPerWavePerShuffle
     S<1, 64, 1, 4>,   // CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl
     16>;              // CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on

#define BUILD_INT4_EXAMPLE
#include "run_batched_gemm_example.inc"

int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
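Note the split above: the user-facing types are ck::int4_t, but the device instance is declared with the int8_t kernel types. The run_batched_gemm_example.inc changes below bridge the two by converting tensors element-wise on the host before and after the device run. A self-contained sketch of that round-trip pattern, with Int4Stub and Tensor as simplified stand-ins for the CK types:

// Simplified stand-ins; not the CK implementation.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Int4Stub // models ck::int4_t: one byte, values in [-8, 7]
{
    int8_t value{};
    Int4Stub() = default;
    Int4Stub(int v) : value(static_cast<int8_t>(v)) {}
    operator int8_t() const { return value; }
};
static_assert(sizeof(Int4Stub) == sizeof(int8_t), "same-size requirement");

template <typename T>
struct Tensor
{
    std::vector<T> mData;
    // Converting construction: per-element cast, as in a_g_m_k_converted(a_g_m_k).
    template <typename U>
    explicit Tensor(const std::vector<U>& src)
    {
        for(const auto& v : src)
            mData.push_back(static_cast<T>(v));
    }
};

int main()
{
    std::vector<Int4Stub> host{-8, -1, 0, 3, 7};    // e.g. a_g_m_k's payload
    Tensor<int8_t> device_side(host);               // convert before ToDevice
    Tensor<Int4Stub> round_trip(device_side.mData); // convert after FromDevice
    for(std::size_t i = 0; i < host.size(); ++i)
        assert(round_trip.mData[i].value == host[i].value); // nothing lost
    return 0;
}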
example/24_batched_gemm/run_batched_gemm_example.inc @ 6ceb900b
#include <random>

#pragma once

struct ProblemSize final

...

@@ -28,7 +30,23 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
    using namespace ck::literals;

#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
    static_assert(sizeof(ADataType) == sizeof(KernelADataType));
    static_assert(sizeof(BDataType) == sizeof(KernelBDataType));
    static_assert(sizeof(EDataType) == sizeof(KernelEDataType));
#endif

    auto& [M,
           N,
           K,
           stride_A,
           stride_B,
           stride_C,
           batch_stride_A,
           batch_stride_B,
           batch_stride_C,
           batch_count] = problem_size;
    // GEMM shape
    auto f_host_tensor_descriptor = [](std::size_t batch_count_,

...

@@ -53,9 +71,13 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{}));
    Tensor<BDataType> b_g_k_n(
        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{}));
#ifdef BUILD_INT4_EXAMPLE
    Tensor<KernelEDataType> e_g_m_n_device_result(
        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
#else
    Tensor<EDataType> e_g_m_n_device_result(
        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{}));
#endif

    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
...

@@ -78,9 +100,16 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
    DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize());

#ifdef BUILD_INT4_EXAMPLE
    const Tensor<KernelADataType> a_g_m_k_converted(a_g_m_k);
    const Tensor<KernelBDataType> b_g_k_n_converted(b_g_k_n);

    a_device_buf.ToDevice(a_g_m_k_converted.mData.data());
    b_device_buf.ToDevice(b_g_k_n_converted.mData.data());
#else
    a_device_buf.ToDevice(a_g_m_k.mData.data());
    b_device_buf.ToDevice(b_g_k_n.mData.data());
#endif

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{};

...

@@ -123,10 +152,8 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
                sizeof(BDataType) * batch_count * K * N +
                sizeof(EDataType) * batch_count * M * N;

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << gemm.GetTypeString() << std::endl;
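A note on units in the lines above: ave_time is in milliseconds, so flop / 1.E9 / ave_time comes out in TFLOP/s and num_btype / 1.E6 / ave_time in GB/s. A standalone check with hypothetical numbers:

// Unit check for the metrics above; the shape and timing here are made up.
#include <cstdint>
#include <iostream>

int main()
{
    const std::int64_t batch = 16, M = 1024, N = 1024, K = 1024;
    const double ave_time = 1.5; // milliseconds, hypothetical

    const double flop      = 2.0 * batch * M * N * K;               // one FMA = 2 flops
    const double num_btype = 1.0 * batch * (M * K + K * N + M * N); // int8-sized elements

    // GFLOP per millisecond equals TFLOP per second; MB per millisecond equals GB/s.
    std::cout << flop / 1.E9 / ave_time << " TFlops\n";
    std::cout << num_btype / 1.E6 / ave_time << " GB/s\n";
    return 0;
}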
...

@@ -136,8 +163,14 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
    {
        c_device_buf.FromDevice(e_g_m_n_device_result.mData.data());

        using ReferenceBatchedGemmInstance = ck::tensor_operation::host::ReferenceBatchedGemm<
            ADataType, BDataType, EDataType, AccDataType, AElementOp, BElementOp, CDEElementOp>;

        auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
        auto ref_invoker      = ref_batched_gemm.MakeInvoker();

...

@@ -150,8 +183,14 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
        ref_invoker.Run(ref_argument);

#ifdef BUILD_INT4_EXAMPLE
        const Tensor<EDataType> e_device_result_converted(e_g_m_n_device_result);

        pass &= ck::utils::check_err(e_device_result_converted.mData, e_g_m_n_host_result.mData);
#else
        pass = ck::utils::check_err(
            e_g_m_n_device_result.mData, e_g_m_n_host_result.mData, "Error: Incorrect results c");
#endif
    }

    return pass ? 0 : 1;
...

@@ -162,9 +201,12 @@ bool run_batched_gemm_example(int argc, char* argv[])
    ProblemSize problem_size;
    ExecutionConfig config;

    std::mt19937 gen(11939);
    std::uniform_int_distribution<int> dis(0, 15);

    problem_size.M = 256 * (dis(gen) + 1);
    problem_size.N = 128 * (dis(gen) + 1);
    problem_size.K = 64 * (dis(gen) + 2);

    problem_size.stride_A = problem_size.K;
    problem_size.stride_B = problem_size.K;

...
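This last hunk drops the old rand() % 16 based sizing in favor of a fixed-seed std::mt19937 (the reason for the new #include <random> at the top of the file), so every run reproduces the same M, N, K. The same sizing logic as a standalone program:

// Reproducible problem sizes, mirroring the example's sizing logic.
#include <iostream>
#include <random>

int main()
{
    std::mt19937 gen(11939); // fixed seed: identical shapes on every run
    std::uniform_int_distribution<int> dis(0, 15);

    const int M = 256 * (dis(gen) + 1); // multiple of 256, at most 4096
    const int N = 128 * (dis(gen) + 1); // multiple of 128, at most 2048
    const int K = 64 * (dis(gen) + 2);  // multiple of 64; the +2 keeps K >= 128

    std::cout << "M=" << M << " N=" << N << " K=" << K << '\n';
    return 0;
}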