Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4d38b385
Commit
4d38b385
authored
Aug 24, 2022
by
Adam Osewski
Browse files
Grouped GEmm int4.
parent
fa2d894b
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
148 additions
and
3 deletions
+148
-3
example/15_grouped_gemm/CMakeLists.txt
example/15_grouped_gemm/CMakeLists.txt
+13
-0
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+101
-0
example/15_grouped_gemm/run_grouped_gemm_example.inc
example/15_grouped_gemm/run_grouped_gemm_example.inc
+34
-3
No files found.
example/15_grouped_gemm/CMakeLists.txt
View file @
4d38b385
add_custom_target
(
example_grouped_gemm_xdl
)
add_example_executable
(
example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp
)
add_example_executable
(
example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp
)
add_example_executable
(
example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp
)
add_dependencies
(
example_grouped_gemm_xdl
example_grouped_gemm_xdl_fp32
example_grouped_gemm_xdl_fp16
example_grouped_gemm_xdl_bfp16
example_grouped_gemm_xdl_int8
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp
)
add_dependencies
(
example_grouped_gemm_xdl example_grouped_gemm_xdl_int4
)
endif
()
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
0 → 100644
View file @
4d38b385
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
ck
::
int4_t
;
using
BDataType
=
ck
::
int4_t
;
using
AccDataType
=
int32_t
;
using
CShuffleDataType
=
int32_t
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
EDataType
=
ck
::
int4_t
;
using
KernelADataType
=
int8_t
;
using
KernelBDataType
=
int8_t
;
using
KernelEDataType
=
int8_t
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
DsLayout
=
ck
::
Tuple
<>
;
using
ELayout
=
Row
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedGemm_Xdl
// clang-format off
<
ALayout
,
//ALayout
BLayout
,
//BLayout
DsLayout
,
//DsLayout
ELayout
,
//ELayout
KernelADataType
,
//ADataType
KernelBDataType
,
//BDataType
AccDataType
,
//AccDataType
CShuffleDataType
,
//CShuffleDataType
DsDataType
,
//DsDataType
KernelEDataType
,
//EDataType
AElementOp
,
//AElementwiseOperation
BElementOp
,
//BElementwiseOperation
CDEElementOp
,
//CDEElementwiseOperation
GemmDefault
,
//GEMMSpecialization
1
,
// NumGemmKPrefetchStage
256
,
// BlockSize
256
,
// MPerBlock
128
,
// NPerBlock
64
,
// KPerBlock
16
,
// AK1
16
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
4
,
// MXdlPerWave
2
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransfer ThreadCluster Lengths_K0_M_K1
S
<
1
,
0
,
2
>
,
// ABlockTransfer ThreadCluster ArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransfer SrcAccessOrder
2
,
// ABlockTransfer SrcVectorDim
16
,
// ABlockTransfer SrcScalarPerVector
16
,
// ABlockTransfer DstScalarPerVector_K1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransfer ThreadCluster Lengths_K0_N_K1
S
<
1
,
0
,
2
>
,
// BBlockTransfer ThreadCluster ArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransfer SrcAccessOrder
2
,
// BBlockTransfer SrcVectorDim
16
,
// BBlockTransfer SrcScalarPerVector
16
,
// BBlockTransfer DstScalarPerVector_K1
1
,
// BBlockLdsExtraN
1
,
// CShuffleMXdlPerWavePerShuffle
1
,
// CShuffleNXdlPerWavePerShuffle
S
<
1
,
64
,
1
,
4
>
,
// CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl
16
>
;
// CBlockTransferScalarPerVector_NWaveNPerXdl
// clang-format on
#define BUILD_INT4_EXAMPLE
#include "run_grouped_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_grouped_gemm_example
(
argc
,
argv
);
}
example/15_grouped_gemm/run_grouped_gemm_example.inc
View file @
4d38b385
...
...
@@ -22,6 +22,12 @@ struct ExecutionConfig final
bool
run_grouped_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
{
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert
(
sizeof
(
ck
::
int4_t
)
==
sizeof
(
int8_t
));
static_assert
(
sizeof
(
ADataType
)
==
sizeof
(
KernelADataType
));
static_assert
(
sizeof
(
BDataType
)
==
sizeof
(
KernelBDataType
));
static_assert
(
sizeof
(
EDataType
)
==
sizeof
(
KernelEDataType
));
#endif
int
group_count
=
problem_size
.
group_count
;
// GEMM shape
...
...
@@ -61,7 +67,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
std
::
vector
<
Tensor
<
ADataType
>>
a_tensors
;
std
::
vector
<
Tensor
<
BDataType
>>
b_tensors
;
std
::
vector
<
Tensor
<
EDataType
>>
c_host_tensors
;
#ifdef BUILD_INT4_EXAMPLE
std
::
vector
<
Tensor
<
KernelEDataType
>>
c_device_tensors
;
#else
std
::
vector
<
Tensor
<
EDataType
>>
c_device_tensors
;
#endif
a_tensors
.
reserve
(
group_count
);
b_tensors
.
reserve
(
group_count
);
...
...
@@ -86,9 +96,13 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
gemm_descs
[
i
]
.
K_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_B_
,
BLayout
{})));
c_host_tensors
.
push_back
(
Tensor
<
EDataType
>
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{})));
#ifdef BUILD_INT4_EXAMPLE
c_device_tensors
.
push_back
(
Tensor
<
KernelEDataType
>
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{})));
#else
c_device_tensors
.
push_back
(
Tensor
<
EDataType
>
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{})));
#endif
std
::
cout
<<
"gemm["
<<
i
<<
"] a_m_k: "
<<
a_tensors
[
i
]
.
mDesc
<<
" b_k_n: "
<<
b_tensors
[
i
]
.
mDesc
<<
" c_m_n: "
<<
c_device_tensors
[
i
]
.
mDesc
<<
std
::
endl
;
...
...
@@ -124,8 +138,16 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
c_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
EDataType
)
*
c_device_tensors
[
i
]
.
mDesc
.
GetElementSpaceSize
()));
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
KernelADataType
>
a_converted
(
a_tensors
[
i
]);
const
Tensor
<
KernelBDataType
>
b_converted
(
b_tensors
[
i
]);
a_tensors_device
[
i
]
->
ToDevice
(
a_converted
.
mData
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_converted
.
mData
.
data
());
#else
a_tensors_device
[
i
]
->
ToDevice
(
a_tensors
[
i
]
.
mData
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_tensors
[
i
]
.
mData
.
data
());
#endif
p_a
.
push_back
(
a_tensors_device
[
i
]
->
GetDeviceBuffer
());
p_b
.
push_back
(
b_tensors_device
[
i
]
->
GetDeviceBuffer
());
...
...
@@ -157,9 +179,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
...
...
@@ -190,7 +210,14 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
EDataType
>
c_device_result_converted
(
c_device_tensors
[
i
]);
pass
&=
ck
::
utils
::
check_err
(
c_device_result_converted
.
mData
,
c_host_tensors
[
i
]
.
mData
);
#else
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
]
.
mData
,
c_host_tensors
[
i
]
.
mData
);
#endif
}
}
...
...
@@ -208,7 +235,11 @@ bool run_grouped_gemm_example(int argc, char* argv[])
{
problem_size
.
Ms
.
push_back
(
256
+
256
*
i
);
problem_size
.
Ns
.
push_back
(
128
+
128
*
i
);
#ifdef BUILD_INT4_EXAMPLE
problem_size
.
Ks
.
push_back
(
128
+
64
*
i
);
#else
problem_size
.
Ks
.
push_back
(
64
+
64
*
i
);
#endif
problem_size
.
stride_As
.
push_back
(
problem_size
.
Ks
[
i
]);
problem_size
.
stride_Bs
.
push_back
(
problem_size
.
Ks
[
i
]);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment