Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
f7b18c9f
Commit
f7b18c9f
authored
Jan 29, 2024
by
Jakub Piasecki
Browse files
fix old files
parent
015807d8
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
343 additions
and
474 deletions
+343
-474
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+24
-54
example/01_gemm/gemm_xdl_fp16_int8.cpp
example/01_gemm/gemm_xdl_fp16_int8.cpp
+0
-41
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
...include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+1
-28
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
...brary/tensor_operation_instance/gpu/gemm_multiply_add.hpp
+0
-50
library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
...ary/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+143
-43
library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
...r_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
+1
-3
library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
..._shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+0
-128
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+90
-87
profiler/src/profile_gemm.cpp
profiler/src/profile_gemm.cpp
+63
-9
profiler/src/profile_gemm_multiply_add.cpp
profiler/src/profile_gemm_multiply_add.cpp
+1
-13
test/gemm/CMakeLists.txt
test/gemm/CMakeLists.txt
+20
-18
No files found.
example/01_gemm/CMakeLists.txt
View file @
f7b18c9f
add_custom_target
(
example_gemm_dl
)
add_example_executable
(
example_gemm_dl_fp32 gemm_dl_fp32.cpp
)
add_example_dependencies
(
example_gemm_dl example_gemm_dl_fp32
)
add_example_executable
(
example_gemm_dl_fp16 gemm_dl_fp16.cpp
)
add_example_
dependencies
(
example_gemm_dl
example_gemm_dl_fp16
)
add_example_
executable
(
example_gemm_dl
_int8 gemm_dl_int8.cpp
)
add_example_executable
(
example_gemm_dpp_fp16 gemm_dpp_fp16.cpp
)
add_dependencies
(
example_gemm_dl example_gemm_dl_fp32
)
add_dependencies
(
example_gemm_dl example_gemm_dl_fp16
)
add_dependencies
(
example_gemm_dl example_gemm_dl_int8
)
add_example_executable
(
example_gemm_dl_int8 gemm_dl_int8.cpp
)
add_example_dependencies
(
example_gemm_dl example_gemm_dl_int8
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_gemm_dl_int4 gemm_dl_int4.cpp
)
add_
example_
dependencies
(
example_gemm_dl example_gemm_dl_int4
)
add_example_executable
(
example_gemm_dl_int4 gemm_dl_int4.cpp
)
add_dependencies
(
example_gemm_dl example_gemm_dl_int4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
add_custom_target
(
example_gemm_xdl
)
add_example_executable
(
example_gemm_xdl_fp16 gemm_xdl_fp16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp16
)
add_example_executable
(
example_gemm_xdl_fp16 gemm_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_wavelet_fp16
)
add_example_executable
(
example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_custom_target
(
example_gemm_wmma
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_example_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
endif
()
add_example_executable
(
example_gemm_xdl_bf16 gemm_xdl_bf16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_bf16
)
add_example_executable
(
example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_bf16_rtn
)
add_example_executable
(
example_gemm_xdl_int8 gemm_xdl_int8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_int8
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_bf16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_int8
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_wavelet_fp16
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_gemm_xdl_int4 gemm_xdl_int4.cpp
)
add_
example_
dependencies
(
example_gemm_xdl example_gemm_xdl_int4
)
add_example_executable
(
example_gemm_xdl_int4 gemm_xdl_int4.cpp
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_int4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable
(
example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp
)
# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
add_example_executable_no_testing
(
example_gemm_xdl_fp64 gemm_xdl_fp64.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp64
)
add_example_executable
(
example_gemm_xdl_streamk gemm_xdl_streamk.cpp
)
add_
example_executable
(
example_gemm_xdl_fp8 gemm_xdl_fp8.cpp
)
add_
example_
dependencies
(
example_gemm_xdl example_gemm_xdl_fp
8
)
add_
dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp
64
)
add_example_executable
(
example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp8_bf8
)
list
(
APPEND gpu_list gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_gemm_xdl_lds_direct_load_fp32 gemm_xdl_lds_direct_load_fp32.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_lds_direct_load_fp32
)
add_example_executable
(
example_gemm_xdl_lds_direct_load_fp16 gemm_xdl_lds_direct_load_fp16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_lds_direct_load_fp16
)
set
(
target 1
)
endif
()
endforeach
()
add_example_executable
(
example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp16_fp8
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_custom_target
(
example_gemm_wmma
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
endif
()
add_example_executable
(
example_gemm_xdl_fp16_int8 gemm_xdl_fp16_int8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp16_int8
)
example/01_gemm/gemm_xdl_fp16_int8.cpp
deleted
100644 → 0
View file @
015807d8
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
int8_t
;
using
CDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
ck
::
half_t
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
CLayout
=
Row
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CElementOp
=
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
LoopSched
=
ck
::
make_default_loop_scheduler
();
static
constexpr
auto
PipelineVer
=
ck
::
PipelineVersion
::
v1
;
using
ComputeType
=
ck
::
half_t
;
// clang-format off
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Loop| Pipeline| ComputeType|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Scheduler| Version| |
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CElementOp
,
GemmDefault
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
LoopSched
,
PipelineVer
,
ComputeType
>
;
// clang-format on
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
View file @
f7b18c9f
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -367,17 +367,6 @@ void add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances(
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
F8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
void
add_device_gemm_xdl_c_shuffle_f16_int8_f16_mk_kn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Row
,
Row
,
F16
,
I8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
void
add_device_gemm_xdl_c_shuffle_f16_int8_f16_mk_nk_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemm
<
Row
,
Col
,
Row
,
F16
,
I8
,
F16
,
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
#endif
template
<
typename
ALayout
,
typename
BLayout
,
...
...
@@ -623,22 +612,6 @@ struct DeviceOperationInstanceFactory<
add_device_gemm_xdl_c_shuffle_f16_f8_f16_mk_nk_mn_instances
(
op_ptrs
);
}
}
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
else
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
int8_t
>
&&
is_same_v
<
CDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f16_int8_f16_mk_kn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
CLayout
,
Row
>
)
{
add_device_gemm_xdl_c_shuffle_f16_int8_f16_mk_nk_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
...
...
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
View file @
f7b18c9f
...
...
@@ -73,34 +73,6 @@ void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_m
MultiplyAdd
>>>&
);
#endif
#if defined CK_ENABLE_INT8
void
add_device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_kn_mn_mn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Row
,
Row_Row_Tuple
,
Row
,
F16
,
I8
,
F16_F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
MultiplyAdd
>>>&
);
void
add_device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceGemmMultipleD
<
Row
,
Col
,
Row_Row_Tuple
,
Row
,
F16
,
I8
,
F16_F16_Tuple
,
F16
,
PassThrough
,
PassThrough
,
MultiplyAdd
>>>&
);
#endif
// GEMM + Multiply + Add
template
<
typename
ALayout
,
typename
BLayout
,
...
...
@@ -183,28 +155,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
}
#endif
#if defined CK_ENABLE_INT8
if
constexpr
(
is_same_v
<
ADataType
,
half_t
>
&&
is_same_v
<
BDataType
,
int8_t
>
&&
is_same_v
<
D0DataType
,
half_t
>
&&
is_same_v
<
D1DataType
,
half_t
>
&&
is_same_v
<
EDataType
,
half_t
>
)
{
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Row
>
&&
is_same_v
<
D0Layout
,
Row
>
&&
is_same_v
<
D1Layout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_kn_mn_mn_mn_instances
(
op_ptrs
);
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
is_same_v
<
D0Layout
,
Row
>
&&
is_same_v
<
D1Layout
,
Row
>
&&
is_same_v
<
ELayout
,
Row
>
)
{
add_device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instances
(
op_ptrs
);
}
}
#endif
return
op_ptrs
;
}
};
...
...
library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
View file @
f7b18c9f
add_instance_library
(
device_gemm_instance
device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
)
set
(
GEMM_INSTANCES
)
list
(
APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp
device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp
)
list
(
APPEND GEMM_INSTANCES
device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_lds_direct_load_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp
device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp
device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp
device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp
)
list
(
APPEND GEMM_INSTANCES
device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp
device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp
device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp
device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp
device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_lds_direct_load_f16_f16_f16_mk_nk_mn_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp
device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
)
list
(
APPEND GEMM_INSTANCES
device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp
device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp
device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp
device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp
device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp
device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp
device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp
device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
)
list
(
APPEND GEMM_INSTANCES
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp
)
list
(
APPEND GEMM_INSTANCES
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_default_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_padded_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp
)
add_instance_library
(
device_gemm_instance
${
GEMM_INSTANCES
}
)
set
(
ENABLE_PIPELINE_V2_OPT
)
if
(
ENABLE_PIPELINE_V2_OPT
)
set
(
WAVES_PER_EU_DEFS
CK_USE_WAVES_PER_EU=1
CK_MIN_WAVES_PER_EU=1
CK_MAX_WAVES_PER_EU=1
)
set
(
IGLP_OPT_DEFS
CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
)
# TODO: The "-vectorize-slp=false" LLVM option is a workaround to prevent inefficient instruction scheduling
# caused by the SLP Vectorizer. Remove this option after fix the SLP Vectorizer issue.
# layout=NT
set_source_files_properties
(
device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS
";-mllvm;-vectorize-slp=false"
COMPILE_DEFINITIONS
"
${
WAVES_PER_EU_DEFS
}
;
${
IGLP_OPT_DEFS
}
"
)
# layout=NN
set_source_files_properties
(
device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS
";-mllvm;-vectorize-slp=false"
COMPILE_DEFINITIONS
"
${
WAVES_PER_EU_DEFS
}
;
${
IGLP_OPT_DEFS
}
"
)
# layout=TT
set_source_files_properties
(
device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS
";;"
COMPILE_DEFINITIONS
"
${
WAVES_PER_EU_DEFS
}
;
${
IGLP_OPT_DEFS
}
"
)
# layout=TN
set_source_files_properties
(
device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
COMPILE_OPTIONS
";;"
COMPILE_DEFINITIONS
"
${
WAVES_PER_EU_DEFS
}
;
${
IGLP_OPT_DEFS
}
"
)
endif
(
ENABLE_PIPELINE_V2_OPT
)
library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt
View file @
f7b18c9f
...
...
@@ -2,7 +2,5 @@ set(GEMM_MULTIPLY_ADD_INSTANCES)
list
(
APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp
device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
)
device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp
)
add_instance_library
(
device_gemm_multiply_add_instance
${
GEMM_MULTIPLY_ADD_INSTANCES
}
)
library/src/tensor_operation_instance/gpu/gemm_multiply_add/device_gemm_multiply_add_xdl_c_shuffle_f16_int8_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
deleted
100644 → 0
View file @
015807d8
This diff is collapsed.
Click to expand it.
profiler/src/CMakeLists.txt
View file @
f7b18c9f
# ckProfiler
set
(
PROFILER_SOURCES
profiler.cpp
# profile_gemm.cpp
# profile_gemm_splitk.cpp
# profile_gemm_bias_add_reduce.cpp
# profile_gemm_add_multiply.cpp
# profile_gemm_multiply_add.cpp
# profile_gemm_reduce.cpp
# profile_batched_gemm.cpp
# profile_batched_gemm_reduce.cpp
# profile_conv_fwd.cpp
# profile_conv_fwd_bias_relu.cpp
# profile_conv_fwd_bias_relu_add.cpp
# profile_conv_bwd_data.cpp
# profile_grouped_conv_fwd.cpp
# profile_grouped_conv_bwd_weight.cpp
# profile_reduce.cpp
# profile_groupnorm_bwd_data.cpp
# profile_groupnorm_fwd.cpp
# profile_layernorm_bwd_data.cpp
# profile_layernorm_fwd.cpp
# profile_max_pool3d_fwd.cpp
# profile_avg_pool3d_bwd.cpp
# profile_max_pool3d_bwd.cpp
# profile_softmax.cpp
# profile_batchnorm_fwd.cpp
# profile_batchnorm_bwd.cpp
# profile_batchnorm_infer.cpp
# profile_grouped_conv_bwd_data.cpp
# profile_conv_tensor_rearrange.cpp
# profile_transpose.cpp
profile_gemm.cpp
profile_gemm_splitk.cpp
profile_gemm_bias_add_reduce.cpp
profile_gemm_add_multiply.cpp
profile_gemm_multiply_add.cpp
profile_gemm_reduce.cpp
profile_batched_gemm.cpp
profile_batched_gemm_reduce.cpp
profile_conv_fwd.cpp
profile_conv_fwd_bias_relu.cpp
profile_conv_fwd_bias_relu_add.cpp
profile_conv_bwd_data.cpp
profile_grouped_conv_fwd.cpp
profile_grouped_conv_bwd_weight.cpp
profile_reduce.cpp
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
profile_softmax.cpp
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
)
if
(
DL_KERNELS
)
...
...
@@ -37,24 +39,24 @@ if(DL_KERNELS)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
#list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_streamk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add.cpp
)
# list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
#
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
#
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
endif
()
set
(
PROFILER_EXECUTABLE ckProfiler
)
...
...
@@ -63,65 +65,66 @@ add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options
(
${
PROFILER_EXECUTABLE
}
PRIVATE -Wno-global-constructors
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE utility getopt::getopt
)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bias_add_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv1d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_gamma_beta_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_softmax_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batchnorm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_avg_pool3d_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_column_to_image_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_transpose_instance
)
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
endif
()
if
(
DL_KERNELS
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_silu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_instance
)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_add_layernorm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_add_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_streamk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_add_relu_gemm_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_fastgelu_instance
)
endif
()
rocm_install
(
TARGETS
${
PROFILER_EXECUTABLE
}
COMPONENT profiler
)
profiler/src/profile_gemm.cpp
View file @
f7b18c9f
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
...
...
@@ -23,6 +23,7 @@ enum struct GemmDataType
F16_F16_F16
,
// 1
BF16_BF16_BF16
,
// 2
INT8_INT8_INT8
,
// 3
F8_F8_F8
,
// 4
};
#define OP_NAME "gemm"
...
...
@@ -31,7 +32,7 @@ enum struct GemmDataType
static
void
print_helper_msg
()
{
std
::
cout
<<
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
<<
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)
\n
"
<<
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8
; 4: fp8
)
\n
"
<<
"arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
\n
"
<<
" 1: A[m, k] * B[n, k] = C[m, n];
\n
"
<<
" 2: A[k, m] * B[k, n] = C[m, n];
\n
"
...
...
@@ -41,12 +42,15 @@ static void print_helper_msg()
<<
"arg6: print tensor value (0: no; 1: yes)
\n
"
<<
"arg7: time kernel (0: no, 1: yes)
\n
"
<<
"arg8 to 13: M, N, K, StrideA, StrideB, StrideC
\n
"
<<
"optional:
\n
"
<<
"arg14: number of warm-up cycles (default 1)
\n
"
<<
"arg15: number of iterations (default 10)
\n
"
<<
std
::
endl
;
}
int
profile_gemm
(
int
argc
,
char
*
argv
[])
{
if
(
argc
!=
14
)
if
(
argc
!=
14
&&
argc
!=
16
)
{
print_helper_msg
();
exit
(
1
);
...
...
@@ -67,11 +71,25 @@ int profile_gemm(int argc, char* argv[])
const
int
StrideB
=
std
::
stoi
(
argv
[
12
]);
const
int
StrideC
=
std
::
stoi
(
argv
[
13
]);
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
BF16
=
ck
::
bhalf_t
;
int
n_warmup
=
1
;
int
n_iter
=
10
;
if
(
argc
==
16
)
{
n_warmup
=
std
::
stoi
(
argv
[
14
]);
n_iter
=
std
::
stoi
(
argv
[
15
]);
}
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
#ifdef CK_ENABLE_BF16
using
BF16
=
ck
::
bhalf_t
;
#endif
#ifdef CK_ENABLE_INT8
using
INT8
=
int8_t
;
using
INT32
=
int32_t
;
#endif
#ifdef CK_ENABLE_FP8
using
F8
=
ck
::
f8_t
;
#endif
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
...
...
@@ -112,12 +130,23 @@ int profile_gemm(int argc, char* argv[])
K
,
(
StrideA
<
0
)
?
DefaultStrideA
:
StrideA
,
(
StrideB
<
0
)
?
DefaultStrideB
:
StrideB
,
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
);
(
StrideC
<
0
)
?
DefaultStrideC
:
StrideC
,
n_warmup
,
n_iter
);
return
pass
?
0
:
1
;
};
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
if
(
data_type
!=
GemmDataType
::
F32_F32_F32
&&
data_type
!=
GemmDataType
::
F16_F16_F16
&&
data_type
!=
GemmDataType
::
BF16_BF16_BF16
&&
data_type
!=
GemmDataType
::
INT8_INT8_INT8
&&
data_type
!=
GemmDataType
::
F8_F8_F8
)
{
// dummy clause before the else clauses for different data types
std
::
cout
<<
"Gemm: this data_type is not implemented"
<<
std
::
endl
;
return
1
;
}
#ifdef CK_ENABLE_FP32
else
if
(
data_type
==
GemmDataType
::
F32_F32_F32
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
Row
{},
Row
{},
Row
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
...
...
@@ -133,6 +162,8 @@ int profile_gemm(int argc, char* argv[])
{
return
profile
(
Col
{},
Col
{},
Row
{},
F32
{},
F32
{},
F32
{},
F32
{});
}
#endif
#ifdef CK_ENABLE_FP16
else
if
(
data_type
==
GemmDataType
::
F16_F16_F16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
Row
{},
Row
{},
Row
{},
F16
{},
F16
{},
F32
{},
F16
{});
...
...
@@ -149,6 +180,8 @@ int profile_gemm(int argc, char* argv[])
{
return
profile
(
Col
{},
Col
{},
Row
{},
F16
{},
F16
{},
F32
{},
F16
{});
}
#endif
#ifdef CK_ENABLE_BF16
else
if
(
data_type
==
GemmDataType
::
BF16_BF16_BF16
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
Row
{},
Row
{},
Row
{},
BF16
{},
BF16
{},
F32
{},
BF16
{});
...
...
@@ -165,6 +198,8 @@ int profile_gemm(int argc, char* argv[])
{
return
profile
(
Col
{},
Col
{},
Row
{},
BF16
{},
BF16
{},
F32
{},
BF16
{});
}
#endif
#ifdef CK_ENABLE_INT8
else
if
(
data_type
==
GemmDataType
::
INT8_INT8_INT8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
Row
{},
Row
{},
Row
{},
INT8
{},
INT8
{},
INT32
{},
INT8
{});
...
...
@@ -181,9 +216,28 @@ int profile_gemm(int argc, char* argv[])
{
return
profile
(
Col
{},
Col
{},
Row
{},
INT8
{},
INT8
{},
INT32
{},
INT8
{});
}
#endif
#ifdef CK_ENABLE_FP8
else
if
(
data_type
==
GemmDataType
::
F8_F8_F8
&&
layout
==
GemmMatrixLayout
::
MK_KN_MN
)
{
return
profile
(
Row
{},
Row
{},
Row
{},
F8
{},
F8
{},
F32
{},
F8
{});
}
else
if
(
data_type
==
GemmDataType
::
F8_F8_F8
&&
layout
==
GemmMatrixLayout
::
MK_NK_MN
)
{
return
profile
(
Row
{},
Col
{},
Row
{},
F8
{},
F8
{},
F32
{},
F8
{});
}
else
if
(
data_type
==
GemmDataType
::
F8_F8_F8
&&
layout
==
GemmMatrixLayout
::
KM_KN_MN
)
{
return
profile
(
Col
{},
Row
{},
Row
{},
F8
{},
F8
{},
F32
{},
F8
{});
}
else
if
(
data_type
==
GemmDataType
::
F8_F8_F8
&&
layout
==
GemmMatrixLayout
::
KM_NK_MN
)
{
return
profile
(
Col
{},
Col
{},
Row
{},
F8
{},
F8
{},
F32
{},
F8
{});
}
#endif
else
{
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"
Gemm:
this data_type & layout is not implemented"
<<
std
::
endl
;
return
1
;
}
...
...
profiler/src/profile_gemm_multiply_add.cpp
View file @
f7b18c9f
...
...
@@ -24,14 +24,13 @@ int profile_gemm_multiply_add(int argc, char* argv[])
{
F16_F16_F16_F16_F16
,
// 0
F16_F8_F32_F32_F16
,
// 1
F16_INT8_F16_F16_F16
,
// 2
};
if
(
argc
!=
16
)
{
// clang-format off
printf
(
"arg1: tensor operation ("
OP_NAME
": "
OP_DESC
")
\n
"
);
printf
(
"arg2: data type (0: fp16; 1: fp16Afp8B
; 2: fp16Aint8B
)
\n
"
);
printf
(
"arg2: data type (0: fp16; 1: fp16Afp8B)
\n
"
);
printf
(
"arg3: matrix layout (0: E[m, n] = Multiply_Add((A[m, k] * B[k, n]) x D1[m, n] + D0[m, n]);
\n
"
);
printf
(
" 1: E[m, n] = Multiply_Add((A[m, k] * B[n, k]) x D1[m, n] + D0[m, n]);
\n
"
);
printf
(
"arg4: verification (0: no; 1: yes)
\n
"
);
...
...
@@ -60,7 +59,6 @@ int profile_gemm_multiply_add(int argc, char* argv[])
const
int
StrideD1
=
std
::
stoi
(
argv
[
14
]);
const
int
StrideE
=
std
::
stoi
(
argv
[
15
]);
using
INT8
=
int8_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
#if defined CK_ENABLE_FP8
...
...
@@ -136,16 +134,6 @@ int profile_gemm_multiply_add(int argc, char* argv[])
{
return
profile
(
F16
{},
F16
{},
F32
{},
F16
{},
F16
{},
F16
{},
Row
{},
Col
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
F16_INT8_F16_F16_F16
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN_MN
)
{
return
profile
(
F16
{},
INT8
{},
F16
{},
F16
{},
F16
{},
F16
{},
Row
{},
Row
{},
Row
{},
Row
{},
Row
{});
}
else
if
(
data_type
==
MatrixDataType
::
F16_INT8_F16_F16_F16
&&
layout
==
MatrixLayout
::
MK_NK_MN_MN_MN
)
{
return
profile
(
F16
{},
INT8
{},
F16
{},
F16
{},
F16
{},
F16
{},
Row
{},
Col
{},
Row
{},
Row
{},
Row
{});
}
#if defined CK_ENABLE_FP8
else
if
(
data_type
==
MatrixDataType
::
F16_F8_F32_F32_F16
&&
layout
==
MatrixLayout
::
MK_KN_MN_MN_MN
)
...
...
test/gemm/CMakeLists.txt
View file @
f7b18c9f
add_test_executable
(
test_gemm_fp32 gemm_fp32.cpp
)
target_link_libraries
(
test_gemm_fp32 PRIVATE utility
)
target_link_libraries
(
test_gemm_fp32 PRIVATE device_gemm_instance
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_gemm_fp32 PRIVATE
utility
device_gemm_instance
)
endif
()
add_test_executable
(
test_gemm_fp16 gemm_fp16.cpp
)
target_link_libraries
(
test_gemm_fp16 PRIVATE utility
)
target_link_libraries
(
test_gemm_fp16 PRIVATE device_gemm_instance
)
add_test_executable
(
test_gemm_bf16 gemm_bf16.cpp
)
target_link_libraries
(
test_gemm_bf16 PRIVATE utility
)
target_link_libraries
(
test_gemm_bf16 PRIVATE device_gemm_instance
)
add_test_executable
(
test_gemm_int8 gemm_int8.cpp
)
target_link_libraries
(
test_gemm_int8 PRIVATE utility
)
target_link_libraries
(
test_gemm_int8 PRIVATE device_gemm_instance
)
add_library
(
gemm_standalone_xdl_fp16_instances STATIC
if
(
result EQUAL 0
)
target_link_libraries
(
test_gemm_fp16 PRIVATE utility device_gemm_instance
)
add_library
(
gemm_standalone_xdl_fp16_instances STATIC
instance/gemm_f16_nn_instance.cpp
instance/gemm_f16_nt_instance.cpp
instance/gemm_f16_tn_instance.cpp
instance/gemm_wavelet_f16_tn_instance.cpp
instance/gemm_f16_tt_instance.cpp
)
)
endif
()
add_test_executable
(
test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp
)
target_link_libraries
(
test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility
)
target_include_directories
(
test_gemm_standalone_xdl_fp16 PRIVATE instance/
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility
)
target_include_directories
(
test_gemm_standalone_xdl_fp16 PRIVATE instance/
)
endif
()
add_test_executable
(
test_gemm_bf16 gemm_bf16.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_gemm_bf16 PRIVATE utility device_gemm_instance
)
endif
()
add_test_executable
(
test_gemm_int8 gemm_int8.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_gemm_int8 PRIVATE utility device_gemm_instance
)
endif
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment