gaoqiong / composable_kernel_ROCM / Commits

Commit f27c50a7, authored Mar 22, 2024 by Jakub Piasecki
working version
parent 0ea428c3

Changes 10: showing 10 changed files with 614 additions and 67 deletions (+614 -67)
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp   +31 -16
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp   +8 -5
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp   +13 -13
library/include/ck/library/utility/check_err.hpp   +6 -6
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt   +1 -1
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp   +24 -22
profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp   +5 -4
profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp   +363 -0
profiler/src/CMakeLists.txt   +1 -0
profiler/src/profile_grouped_gemm_two_stage.cpp   +162 -0
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp
@@ -153,6 +153,9 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
                                                 PipelineVer,
                                                 ComputeDataType>;

+    // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    // indices 1,3 -> MPerBlock, NPerBlock || divided by these -> per-thread sizes
     template <typename ELay>
     static auto MakeEGridDescriptor_M_N(index_t M, index_t N, index_t StrideE)
     {
@@ -216,10 +219,8 @@ template <typename ELay>
     static constexpr auto MakeElementwiseInputSequence()
     {
         return generate_sequence_v2(
-            [&](auto i) constexpr { return Number<i + 1 - i>{}; },
+            [&]([[maybe_unused]] auto i) constexpr {
+                return Number<CDEShuffleBlockTransferScalarPerVector_NPerBlock>{};
+            },
             Number<NumDTensor + 1>{}); // CShuffleNXdlPerWavePerShuffle
     }

     using CGridDesc_M_N = typename GridwiseGemm::CGridDesc_M_N;
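A sketch of what the replacement generator yields; the concrete values are assumed for illustration, not taken from the diff:

    // With NumDTensor = 1 and CDEShuffleBlockTransferScalarPerVector_NPerBlock = 8,
    // MakeElementwiseInputSequence() now evaluates to ck::Sequence<8, 8>: one identical
    // vector-width entry per elementwise input (the C workspace plus each D tensor).
    // The old generator, Number<i + 1 - i>, produced the constant 1 for every slot.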
@@ -227,20 +228,21 @@ template <typename ELay>
     using DsGridDesc_M_N = decltype(MakeDsGridDescriptor_M_N({}, {}, {}));
     using DsGridPointer  = decltype(MakeDsGridPointer());
     using CDGridDesc_M_N = decltype(concat_tuple(ck::Tuple<CGridDesc_M_N>{}, DsGridDesc_M_N{}));
     // using CDDataTypes = decltype(concat_tuple(ck::Tuple<WorkspaceDataType*>{}, DsDataType{}));
     using CDDataTypes = decltype(concat_tuple(ck::Tuple<WorkspaceDataType*>{}, DsGridPointer{}));
     using ElementwiseInputSequence = decltype(MakeElementwiseInputSequence());

-    using GridwiseElementwise =
-        GridwiseElementwise_2D<CDGridDesc_M_N, // change to C, D_0, ..., D_n / tuple<C, D_0, ..., D_N>
-                               ck::Tuple<EGridDesc_M_N>,
-                               CDDataTypes,
-                               ck::Tuple<EDataType*>,
-                               CDEElementwiseOperation,
-                               CDEShuffleBlockTransferScalarPerVector_NPerBlock, // MPerThread
-                               CDEShuffleBlockTransferScalarPerVector_NPerBlock, // NPerThread
-                               ElementwiseInputSequence,
-                               ck::Sequence<8>>;
+    static constexpr index_t ClusterLengthMPerBlock =
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1);
+    static constexpr index_t ClusterLengthNPerBlock =
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3);
+
+    using GridwiseElementwise =
+        GridwiseElementwise_2D<CDGridDesc_M_N,
+                               ck::Tuple<EGridDesc_M_N>,
+                               CDDataTypes, // change to C, D_0, ..., D_n / tuple<C, D_0, ..., D_N>
+                               ck::Tuple<EDataType*>,
+                               CDEElementwiseOperation,
+                               MPerBlock / ClusterLengthMPerBlock,
+                               NPerBlock / ClusterLengthNPerBlock,
+                               ElementwiseInputSequence,
+                               ck::Sequence<CDEShuffleBlockTransferScalarPerVector_NPerBlock>>;

     using Block2ETileMapKSplit =
         BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>;
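A worked example of the new per-thread tile arithmetic; the tuning values are assumed for illustration:

    // If CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock = Sequence<1, 32, 1, 8>
    // and MPerBlock = NPerBlock = 128, then
    //   ClusterLengthMPerBlock = At(1) = 32  ->  MPerThread = 128 / 32 = 4
    //   ClusterLengthNPerBlock = At(3) = 8   ->  NPerThread = 128 / 8  = 16
    // so each of the 32 * 8 = 256 threads of the elementwise kernel now owns a
    // 4 x 16 tile, instead of the fixed scalar-per-vector value used before.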
@@ -324,10 +326,9 @@ template <typename ELay>
         gemm_kernel_args_.reserve(group_count_);
         elementwise_c_grid_descs_m_n_.reserve(group_count_);
         elementwise_d_grid_descs_m_n_.reserve(group_count_);
         ds_grid_pointer_.reserve(group_count_);
+        group_grid_size_.reserve(group_count_);

         for(std::size_t i = 0; i < gemm_descs.size(); ++i)
         {
@@ -371,6 +372,7 @@ template <typename ELay>
             const index_t block_end = grid_size_ + grid_size_grp;
             grid_size_ += grid_size_grp;
+            group_grid_size_[i] = grid_size_grp;

             // block-to-e-tile map
             auto grouped_block_2_ctile_map =
                 GroupedGemmBlock2ETileMap(local_b2c_tile_map, block_start);
@@ -449,12 +451,25 @@ template <typename ELay>
             auto grouped_block_2_ctile_map =
                 GroupedGemmBlock2ETileMap(local_b2c_tile_map, block_start);

+            group_grid_size_[i] = grid_size_grp;
             karg.KPadded  = k_padded;
             karg.K0Padded = k0_padded;
             karg.k_batch  = K_BATCH;

             gemm_kernel_args_[i].block_2_ctile_map_ = grouped_block_2_ctile_map;
             gemm_kernel_args_[i].block_start_       = block_start;
             gemm_kernel_args_[i].block_end_         = block_end;

+#if DEBUG_LOG
+            index_t tiles = (block_end - block_start) / K_BATCH;
+            std::cout << "block_start: " << block_start << "\n"
+                      << "block_end: " << block_end << "\n"
+                      << "tiles: " << tiles << std::endl << std::endl;
+            std::cout << "KPadded: " << karg.KPadded << std::endl
+                      << "K0Padded: " << karg.K0Padded << std::endl
+                      << "KBatch: " << karg.k_batch << std::endl
+                      << "grid_size_: " << grid_size_ << std::endl;
+#endif
         }
     }
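The debug arithmetic above implies the following block accounting (numbers assumed for illustration):

    // A group whose C tile grid holds 24 output tiles, launched with K_BATCH = 4,
    // occupies grid_size_grp = 24 * 4 = 96 workgroups, so
    //   tiles = (block_end - block_start) / K_BATCH = 96 / 4 = 24
    // recovers the per-group output-tile count from its block range.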
@@ -515,6 +530,7 @@ template <typename ELay>
         std::vector<std::array<const void*, NumDTensor>>& p_Ds_;
         std::vector<std::array<index_t, NumDTensor>> stride_Ds_;
         std::vector<GemmTransKernelArg> gemm_kernel_args_;
+        std::vector<index_t> group_grid_size_;
         std::vector<CGridDesc_M_N> elementwise_c_grid_descs_m_n_;
         std::vector<DsGridDesc_M_N> elementwise_d_grid_descs_m_n_;
@@ -755,7 +771,7 @@ template <typename ELay>
             for(int i = 0; i < arg.group_count_; ++i)
             {
                 time += launch_and_time_kernel(
                     stream_config,
                     elementwise_kernel,
-                    dim3(arg.grid_size_),          // probably group_grid_size <<< change it to group_grid_size[i]
+                    dim3(arg.group_grid_size_[i]), // probably group_grid_size <<< change it to group_grid_size[i]
                     dim3(BlockSize),
                     0,
                     concat_tuple(make_tuple(arg.elementwise_c_grid_descs_m_n_[i]),
                                  arg.elementwise_d_grid_descs_m_n_[i]),

@@ -763,10 +779,9 @@ template <typename ELay>
                     concat_tuple(make_tuple(arg.gemm_kernel_args_[i].karg_.p_c_grid),
                                  arg.ds_grid_pointer_[i]),
                     type_convert<EDataType*>(arg.e_ptrs_[i]),
                     arg.cde_element_op_,
-                    CDEShuffleBlockTransferScalarPerVector_NPerBlock,  // num_threads_m
-                    CDEShuffleBlockTransferScalarPerVector_NPerBlock); // num_threads_n
+                    ClusterLengthMPerBlock,  // num_threads_m
+                    ClusterLengthNPerBlock); // num_threads_n
             }

             return time;
         }
     };
@@ -870,7 +885,7 @@ template <typename ELay>
         auto str = std::stringstream();
         // clang-format off
-        str << "DeviceGroupedGemm_XdlSplitKTileLoop"
+        str << "DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage"
             << "<"
             << std::string(ALayout::name)[0] << ","
             << std::string(BLayout::name)[0] << ","
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -37,9 +37,9 @@ __global__ void
        kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
                                       const index_t group_count,
-                                      const AElementwiseOperation a_element_op = AElementwiseOperation{},
-                                      const BElementwiseOperation b_element_op = BElementwiseOperation{},
-                                      const CElementwiseOperation c_element_op = CElementwiseOperation{})
+                                      const AElementwiseOperation a_element_op,
+                                      const BElementwiseOperation b_element_op,
+                                      const CElementwiseOperation c_element_op)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx94__))
@@ -206,7 +206,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
    static constexpr index_t B2E_M01 = 8;
    using GroupedGemmBlock2ETileMap = OffsettedBlockToCTileMap<Block2ETileMapKSplit>;
    using KernelArgument = typename GridwiseGemm::Argument;
+   using PassThrough    = ck::tensor_operation::element_wise::PassThrough;

    struct GemmTransKernelArg
    {
        KernelArgument karg_;
@@ -450,7 +450,10 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
                    dim3(BlockSize),
                    0,
                    cast_pointer_to_constant_address_space(arg.p_workspace_),
-                   arg.gemm_kernel_args_.size());
+                   arg.gemm_kernel_args_.size(),
+                   PassThrough{},
+                   PassThrough{},
+                   PassThrough{});
            };

            if(all_have_main_k0_block_loop)
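A plausible reading of this hunk together with the kernel-signature change above; the launcher shape below is a sketch, not quoted from the repository:

    // CK-style launchers forward an explicit argument pack to the kernel, roughly:
    //   template <typename Kernel, typename... Args>
    //   float launch_and_time_kernel(const StreamConfig&, Kernel kernel,
    //                                dim3 grid, dim3 block, std::size_t lds, Args... args);
    // Because args... is expanded exactly as supplied, default arguments on the
    // __global__ declaration are never consulted, so the defaults were dropped and
    // PassThrough{} is now passed explicitly at the call site.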
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -146,18 +146,18 @@ void add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(
                                                   PassThrough,
                                                   PassThrough>>>& instances);

-// void add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
-//     std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
-//                                                   Row,
-//                                                   Empty_Tuple,
-//                                                   Row,
-//                                                   F16,
-//                                                   F16,
-//                                                   Empty_Tuple,
-//                                                   F16,
-//                                                   PassThrough,
-//                                                   PassThrough,
-//                                                   PassThrough>>>& instances)
+void add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);

 template <typename ALayout,
           typename BLayout,
@@ -203,7 +203,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
            add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
            add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
-           // add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+           add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
        }
        else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                          is_same_v<ELayout, Row>)
library/include/ck/library/utility/check_err.hpp
@@ -55,7 +55,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;

@@ -106,7 +106,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;

@@ -156,7 +156,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;

@@ -211,7 +211,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r
                           << std::endl;

@@ -260,7 +260,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;

@@ -305,7 +305,7 @@ check_err(const Range& out,
         {
             max_err = err > max_err ? err : max_err;
             err_count++;
-            if(err_count < 5)
+            if(err_count < 50000)
             {
                 std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
@@ -9,5 +9,5 @@ add_instance_library(device_grouped_gemm_instance
    device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
    device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
    device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instance.cpp
-   # device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp
+   device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp
)
library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_multiple_d_splitk_xdl_two_stage_f16_f16_f16_mk_kn_mn_instance.cpp

This diff is collapsed.
profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
@@ -270,15 +270,16 @@ bool profile_grouped_gemm_fixed_nk_impl(int do_verification,
         for(std::size_t i = 0; i < gemm_descs.size(); i++)
             c_device_buf[i]->SetZero();

+        std::cout << "p1\n";
         invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+        std::cout << "p2\n";

         if(do_verification)
         {
             bool instance_pass = true;

             for(std::size_t i = 0; i < gemm_descs.size(); i++)
             {
+                std::cout << "p3\n";
                 c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());

                 if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
@@ -316,10 +317,10 @@ bool profile_grouped_gemm_fixed_nk_impl(int do_verification,
             pass = pass && instance_pass;
         }
+        std::cout << "p4\n";
         float ave_time = invoker_ptr->Run(argument_ptr.get(),
                                           StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
+        std::cout << "p5\n";

         if(time_kernel)
         {
             std::size_t flop = 0, num_btype = 0;
profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp (new file, 0 → 100644)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_multiple_d_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

namespace ck {
namespace profiler {
template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AccDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
bool profile_grouped_gemm_two_stage_impl(int do_verification,
                                         int init_method,
                                         bool do_log,
                                         bool time_kernel,
                                         const std::vector<int>& Ms,
                                         const std::vector<int>& Ns,
                                         const std::vector<int>& Ks,
                                         const std::vector<int>& StrideAs,
                                         const std::vector<int>& StrideBs,
                                         const std::vector<int>& StrideCs,
                                         int kbatch   = 1,
                                         int n_warmup = 1,
                                         int n_iter   = 10)
{
    bool pass = true;
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;

            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
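    // Hypothetical use of the helper above, to make the stride convention concrete:
    //   f_host_tensor_descriptor(4, 3, 3, tensor_layout::gemm::RowMajor{})
    //     -> HostTensorDescriptor({4, 3}, {3, 1})  (row stride 3, contiguous columns)
    //   f_host_tensor_descriptor(4, 3, 4, tensor_layout::gemm::ColumnMajor{})
    //     -> HostTensorDescriptor({4, 3}, {1, 4})  (contiguous rows, column stride 4)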
    std::size_t group_count = Ms.size();

    if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
         group_count == StrideBs.size() && group_count == StrideCs.size()))
    {
        throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
    }

    std::vector<Tensor<ADataType>> a_m_k;
    std::vector<Tensor<BDataType>> b_k_n;
    std::vector<Tensor<CDataType>> c_m_n_host_results;
    std::vector<Tensor<CDataType>> c_m_n_device_results;

    for(std::size_t i = 0; i < group_count; i++)
    {
        a_m_k.push_back(
            Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
        b_k_n.push_back(
            Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
        c_m_n_device_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
        c_m_n_host_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));

#if DEBUG_LOG
        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
#endif // DEBUG_LOG

        std::size_t num_thread = 1;
        switch(init_method)
        {
        case 0: break;
        case 1:
            a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
            b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
            break;
        default:
            a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
            b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
        }
    }
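    // Note on the generators above (behavior as commonly used in CK's host
    // utilities; the exact distribution is an assumption here):
    // GeneratorTensor_2<T>{-5, 5} draws random integer values from the given range,
    // which are exactly representable in fp16, while GeneratorTensor_3<T>{lo, hi}
    // draws decimal values that exercise rounding. This matches the profiler flags
    // "1: integer value; 2: decimal value".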
    using AElementOp = ck::tensor_operation::element_wise::PassThrough;
    using BElementOp = ck::tensor_operation::element_wise::PassThrough;
    using CElementOp = ck::tensor_operation::element_wise::PassThrough;

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};
    const auto c_element_op = CElementOp{};

    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;

    a_device_buf.reserve(group_count);
    b_device_buf.reserve(group_count);
    c_device_buf.reserve(group_count);

    std::vector<const void*> p_a, p_b;
    std::vector<void*> p_c;

    p_a.reserve(group_count);
    p_b.reserve(group_count);
    p_c.reserve(group_count);

    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
    gemm_descs.reserve(group_count);

    for(std::size_t i = 0; i < group_count; i++)
    {
        a_device_buf.emplace_back(
            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
        b_device_buf.emplace_back(
            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
        c_device_buf.emplace_back(std::make_unique<DeviceMem>(
            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));

        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());

        gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});

        p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
        p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
        p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
    }
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
                                                                     BLayout,
                                                                     ck::Tuple<>,
                                                                     CLayout,
                                                                     ADataType,
                                                                     BDataType,
                                                                     ck::Tuple<>,
                                                                     CDataType,
                                                                     AElementOp,
                                                                     BElementOp,
                                                                     CElementOp>;

    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    if(op_ptrs.size() <= 0)
    {
        throw std::runtime_error("wrong! no device GEMM instance found");
    }

    std::string best_gemm_name;
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
    float best_kbatch     = 0;

    auto p_ds = std::vector<std::array<const void*, 0>>{};
    if(do_verification)
    {
        for(std::size_t i = 0; i < gemm_descs.size(); i++)
        {
            using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                    BDataType,
                                                                                    CDataType,
                                                                                    AccDataType,
                                                                                    AElementOp,
                                                                                    BElementOp,
                                                                                    CElementOp>;
            auto ref_gemm    = ReferenceGemmInstance{};
            auto ref_invoker = ref_gemm.MakeInvoker();

            auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
                                                      b_k_n[i],
                                                      c_m_n_host_results[i],
                                                      a_element_op,
                                                      b_element_op,
                                                      c_element_op);

            ref_invoker.Run(ref_argument);
        }
    }
    // profile device GEMM instances
    for(auto& gemm_ptr : op_ptrs)
    {
        auto argument_ptr =
            gemm_ptr->MakeArgumentPointer(p_a,
                                          p_b,
                                          p_ds,
                                          p_c,
                                          gemm_descs,
                                          ck::tensor_operation::element_wise::PassThrough{},
                                          ck::tensor_operation::element_wise::PassThrough{},
                                          ck::tensor_operation::element_wise::PassThrough{});

        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();

        DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
        gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());

        std::string gemm_name = gemm_ptr->GetTypeString();

        using DeviceOpSplitK =
            ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitK<ALayout,
                                                                           BLayout,
                                                                           ck::Tuple<>,
                                                                           CLayout,
                                                                           ADataType,
                                                                           BDataType,
                                                                           ck::Tuple<>,
                                                                           CDataType,
                                                                           AElementOp,
                                                                           BElementOp,
                                                                           CElementOp>;

        // skip non-splitk grouped_gemm
        if(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get()) == nullptr)
        {
            continue;
        }

        std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};

        if(kbatch > 0)
        {
            kbatch_list = {kbatch};
        }
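        // Sketch of the two-stage split-K scheme this sweep exercises (inferred
        // from the TwoStage device op above, not spelled out in this file):
        //   stage 1: each of kbatch_curr partial GEMMs covers K / kbatch_curr of
        //            the reduction and writes its partial C into a workspace;
        //   stage 2: a gridwise elementwise kernel combines the workspace (plus
        //            any D tensors) into the final output.
        // Larger kbatch exposes more parallelism for small M x N grids at the cost
        // of extra workspace traffic, which is why a list of values is profiled.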
        for(std::size_t j = 0; j < kbatch_list.size(); j++)
        {
            auto kbatch_curr = kbatch_list[j];

            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
                ->SetKBatchSize(argument_ptr.get(), kbatch_curr);

            DeviceMem gemm_arg_dev_mem(dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
                                           ->GetDeviceKernelArgSize(argument_ptr.get()));

            dynamic_cast<DeviceOpSplitK*>(gemm_ptr.get())
                ->SetDeviceKernelArgs(argument_ptr.get(), gemm_arg_dev_mem.GetDeviceBuffer());

            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                gemm_desc_workspace.SetZero();
                for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    c_device_buf[i]->SetZero();

                invoker_ptr->Run(argument_ptr.get(),
                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});
                if(do_verification)
                {
                    bool instance_pass = true;

                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    {
                        c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());

                        if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
                        {
                            instance_pass = instance_pass &&
                                            ck::utils::check_err(c_m_n_device_results[i],
                                                                 c_m_n_host_results[i],
                                                                 "Error: Incorrect results!",
                                                                 0.06);
                        }
                        else
                        {
                            instance_pass = instance_pass &&
                                            ck::utils::check_err(c_m_n_device_results[i],
                                                                 c_m_n_host_results[i]);
                        }

                        if(do_log)
                        {
                            LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
                                << std::endl;
                            LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",")
                                << std::endl;
                            LogRangeAsType<float>(
                                std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
                                << std::endl;
                            LogRangeAsType<float>(
                                std::cout << "c_host : ", c_m_n_host_results[i].mData, ",")
                                << std::endl;
                        }
                    }

                    std::cout << "Instance: " << gemm_name << " verification "
                              << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;

                    pass = pass && instance_pass;
                }
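                // The looser 0.06 tolerance for fp16 outputs with kbatch_curr > 1
                // is presumably because split-K rounds partial sums before the
                // final combine, so some drift from the single-pass host
                // reference is expected.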
                float ave_time = invoker_ptr->Run(
                    argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});

                if(time_kernel)
                {
                    std::size_t flop = 0, num_btype = 0;

                    for(std::size_t i = 0; i < gemm_descs.size(); i++)
                    {
                        flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];

                        num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
                                     sizeof(BDataType) * Ks[i] * Ns[i] +
                                     sizeof(CDataType) * Ms[i] * Ns[i];
                    }

                    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
                    float gb_per_sec = num_btype / 1.E6 / ave_time;

                    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
                              << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch "
                              << kbatch_curr << std::endl;
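                    // Unit check of the formulas above with assumed numbers
                    // (ave_time is in ms, hence the 1.E9 / 1.E6 divisors):
                    // one fp16 group with M = N = K = 1024 gives
                    //   flop = 2 * 1024^3 ~= 2.147e9, so at ave_time = 1 ms:
                    //   tflops = 2.147e9 / 1e9 / 1 = 2.147 (i.e. 2.147e12 FLOP/s),
                    //   num_btype = 3 * 2 * 1024^2 ~= 6.29e6 bytes -> gb_per_sec ~= 6.29.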
                    if(tflops > best_tflops)
                    {
                        best_gemm_name  = gemm_name;
                        best_tflops     = tflops;
                        best_ave_time   = ave_time;
                        best_gb_per_sec = gb_per_sec;
                        best_kbatch     = kbatch_curr;
                    }
                }
            }
            else
            {
                std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
                          << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
                  << best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
                  << std::endl;
    }

    return pass;
}

} // namespace profiler
} // namespace ck
profiler/src/CMakeLists.txt
@@ -54,6 +54,7 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
     list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp)
 endif()

 if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
profiler/src/profile_grouped_gemm_two_stage.cpp (new file, 0 → 100644)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <sstream> // for std::istringstream used in argToIntArray below

#include "profiler/profile_grouped_gemm_two_stage_impl.hpp"
#include "profiler_operation_registry.hpp"

enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
};

enum struct GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
    F8_F16_F16,     // 4
    F16_F8_F16,     // 5
};

#define OP_NAME "grouped_gemm_two_stage"
#define OP_DESC "Grouped GEMM TwoStage"
namespace {

std::vector<int> argToIntArray(char* input)
{
    std::vector<int> out;

    std::istringstream in(input);

    std::string item;

    while(std::getline(in, item, ','))
    {
        out.push_back(std::stoi(item));
    }

    return out;
}
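// Hypothetical call, showing the comma-separated convention the arguments below use:
//   argToIntArray("256,512,1024") -> std::vector<int>{256, 512, 1024}
// (one entry per GEMM group; all six list arguments must be the same length).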
int profile_grouped_gemm_two_stage(int argc, char* argv[])
{
    if(argc < 14)
    {
        std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
                  << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8)\n"
                  << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
                  << "                     1: A[m, k] * B[n, k] = C[m, n])\n"
                  << "arg4: verification (0: no; 1: yes)\n"
                  << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
                  << "arg6: print tensor value (0: no; 1: yes)\n"
                  << "arg7: time kernel (0: no, 1: yes)\n"
                  << "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
                     "64,64 64,64 128,128)\n"
                  << "arg14: kbatch value (default 1)\n"
                  << "optional:\n"
                  << "arg15: number of warm-up cycles (default 1)\n"
                  << "arg16: number of iterations (default 10)\n"
                  << std::endl;

        exit(1);
    }
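    // Example invocation matching the argument list above (sizes hypothetical,
    // two groups; "ckProfiler" is the usual name of CK's profiler binary):
    //   ./ckProfiler grouped_gemm_two_stage 1 0 1 1 0 1 \
    //       256,256 128,128 64,64 64,64 64,64 128,128 4
    // i.e. fp16 data, row-major layouts, verification and timing on, integer
    // init, no tensor dump, and kbatch = 4.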
    const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout    = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));

    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const bool time_kernel     = std::stoi(argv[7]);

    const auto Ms = argToIntArray(argv[8]);
    const auto Ns = argToIntArray(argv[9]);
    const auto Ks = argToIntArray(argv[10]);

    auto StrideAs = argToIntArray(argv[11]); // a: mk b: kn, c: mn: stride a =
    auto StrideBs = argToIntArray(argv[12]);
    auto StrideCs = argToIntArray(argv[13]);

    const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;

    const int DefaultStrideA = Ks[0];
    const int DefaultStrideB = Ns[0];
    const int DefaultStrideC = Ns[0];

    for(size_t i = 0; i < Ms.size(); ++i)
    {
        StrideAs[i] = StrideAs[i] == -1 ? DefaultStrideA : StrideAs[i];
        StrideBs[i] = StrideBs[i] == -1 ? DefaultStrideB : StrideBs[i];
        StrideCs[i] = StrideCs[i] == -1 ? DefaultStrideC : StrideCs[i];
    }

    int n_warmup = 1;
    int n_iter   = 10;
    if(argc == 17)
    {
        n_warmup = std::stoi(argv[15]);
        n_iter   = std::stoi(argv[16]);
    }
#ifdef CK_ENABLE_FP16
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::half_t,
                                                          ck::half_t,
                                                          ck::half_t,
                                                          float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            time_kernel,
            Ms,
            Ns,
            Ks,
            StrideAs,
            StrideBs,
            StrideCs,
            kbatch,
            n_warmup,
            n_iter);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_two_stage_impl<ck::half_t,
                                                          ck::half_t,
                                                          ck::half_t,
                                                          float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::ColumnMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
            do_verification,
            init_method,
            do_log,
            time_kernel,
            Ms,
            Ns,
            Ks,
            StrideAs,
            StrideBs,
            StrideCs,
            kbatch,
            n_warmup,
            n_iter);
    }
    else
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }
#endif

    return 0;
}

} // anonymous namespace
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_two_stage);