Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
125a39d1
Commit
125a39d1
authored
May 21, 2024
by
Adam Osewski
Browse files
Draft: update instances.
parent
333176c5
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
208 additions
and
204 deletions
+208
-204
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d.hpp
...tensor_operation_instance/gpu/grouped_gemm_multiple_d.hpp
+45
-42
library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/CMakeLists.txt
...ation_instance/gpu/grouped_gemm_multiple_d/CMakeLists.txt
+3
-3
library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
..._xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
+20
-20
profiler/include/profiler/profile_grouped_gemm_multiple_d_splitk_impl.hpp
.../profiler/profile_grouped_gemm_multiple_d_splitk_impl.hpp
+6
-5
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+134
-134
No files found.
library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_multiple_d.hpp
View file @
125a39d1
...
@@ -31,44 +31,47 @@ void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_irregu
...
@@ -31,44 +31,47 @@ void add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_nk_mn_irregu
PassThrough
,
PassThrough
,
PassThrough
>>>&
instances
);
PassThrough
>>>&
instances
);
void
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1
(
// void
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemm
<
Row
,
// add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1(
Row
,
// std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
Empty_Tuple
,
// Row,
Row
,
// Empty_Tuple,
F16
,
// Row,
F16
,
// F16,
Empty_Tuple
,
// F16,
F16
,
// Empty_Tuple,
PassThrough
,
// F16,
PassThrough
,
// PassThrough,
PassThrough
>>>&
instances
);
// PassThrough,
// PassThrough>>>& instances);
void
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1_interwave
(
// void
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemm
<
Row
,
// add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1_interwave(
Row
,
// std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
Empty_Tuple
,
// Row,
Row
,
// Empty_Tuple,
F16
,
// Row,
F16
,
// F16,
Empty_Tuple
,
// F16,
F16
,
// Empty_Tuple,
PassThrough
,
// F16,
PassThrough
,
// PassThrough,
PassThrough
>>>&
instances
);
// PassThrough,
// PassThrough>>>& instances);
void
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v2
(
// void
std
::
vector
<
std
::
unique_ptr
<
DeviceGroupedGemm
<
Row
,
// add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v2(
Row
,
// std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
Empty_Tuple
,
// Row,
Row
,
// Empty_Tuple,
F16
,
// Row,
F16
,
// F16,
Empty_Tuple
,
// F16,
F16
,
// Empty_Tuple,
PassThrough
,
// F16,
PassThrough
,
// PassThrough,
PassThrough
>>>&
instances
);
// PassThrough,
// PassThrough>>>& instances);
#endif
#endif
template
<
typename
ALayout
,
template
<
typename
ALayout
,
...
@@ -116,12 +119,12 @@ struct DeviceOperationInstanceFactory<
...
@@ -116,12 +119,12 @@ struct DeviceOperationInstanceFactory<
is_same_v
<
ELayout
,
Row
>
)
is_same_v
<
ELayout
,
Row
>
)
{
{
#if defined(CK_ENABLE_FP16)
#if defined(CK_ENABLE_FP16)
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1
(
//
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1(
op_ptrs
);
//
op_ptrs);
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1_interwave
(
//
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v1_interwave(
op_ptrs
);
//
op_ptrs);
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v2
(
//
add_device_grouped_gemm_multi_d_splitk_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instances_pipeline_v2(
op_ptrs
);
//
op_ptrs);
#endif
#endif
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
else
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
is_same_v
<
BLayout
,
Col
>
&&
...
...
library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/CMakeLists.txt
View file @
125a39d1
# ONLY XDL_KERNELS
# ONLY XDL_KERNELS
add_instance_library
(
device_grouped_gemm_multiple_d_instance
add_instance_library
(
device_grouped_gemm_multiple_d_instance
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1.cpp
#
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1.cpp
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1_interwave.cpp
#
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v1_interwave.cpp
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v2.cpp
#
device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_kn_mn_irregular_instance_pipeline_v2.cpp
)
)
library/src/tensor_operation_instance/gpu/grouped_gemm_multiple_d/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
View file @
125a39d1
This diff is collapsed.
Click to expand it.
profiler/include/profiler/profile_grouped_gemm_multiple_d_splitk_impl.hpp
View file @
125a39d1
...
@@ -84,11 +84,12 @@ bool profile_ggemm_multid_splitk(int do_verification,
...
@@ -84,11 +84,12 @@ bool profile_ggemm_multid_splitk(int do_verification,
c_m_n_host_results
.
push_back
(
c_m_n_host_results
.
push_back
(
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{})));
Tensor
<
CDataType
>
(
f_host_tensor_descriptor
(
Ms
[
i
],
Ns
[
i
],
StrideCs
[
i
],
CLayout
{})));
#if DEBUG_LOG
if
(
ck
::
EnvIsEnabled
(
ENV
(
CK_LOGGING
)))
std
::
cout
<<
"group: "
<<
i
<<
" a_m_k["
<<
i
<<
"]:"
<<
a_m_k
[
i
].
mDesc
<<
", b_k_n["
<<
i
{
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
std
::
cout
<<
"group: "
<<
i
<<
" a_m_k["
<<
i
<<
"]:"
<<
a_m_k
[
i
].
mDesc
<<
", b_k_n["
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
#endif // DEBUG_LOG
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
}
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/src/CMakeLists.txt
View file @
125a39d1
# ckProfiler
# ckProfiler
set
(
PROFILER_SOURCES
set
(
PROFILER_SOURCES
profiler.cpp
profiler.cpp
profile_gemm.cpp
#
profile_gemm.cpp
profile_reduce.cpp
#
profile_reduce.cpp
profile_groupnorm_bwd_data.cpp
#
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
#
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
#
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
#
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
#
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
#
profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp
#
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
#
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
#
profile_max_pool3d_bwd.cpp
profile_softmax.cpp
#
profile_softmax.cpp
profile_batchnorm_fwd.cpp
#
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
#
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
#
profile_batchnorm_infer.cpp
profile_conv_tensor_rearrange.cpp
#
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
#
profile_transpose.cpp
profile_permute_scale.cpp
#
profile_permute_scale.cpp
)
)
if
(
GPU_TARGETS MATCHES
"gfx9"
)
#
if(GPU_TARGETS MATCHES "gfx9")
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
#
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
#
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
endif
()
#
endif()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_gemm_reduce.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_streamk.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_multiple_d_splitk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_multiple_d_splitk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
endif
()
#
endif()
list
(
APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_splitk.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
list
(
APPEND PROFILER_SOURCES profile_gemm_universal.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp
)
#
list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp
)
#
list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
list
(
APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp
)
#
list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp)
list
(
APPEND PROFILER_SOURCES profile_conv_fwd.cpp
)
#
list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp)
endif
()
#
endif()
if
(
GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx9"
)
#
if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx9")
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list
(
APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp
)
#
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
endif
()
#
endif()
list
(
APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
endif
()
#
endif()
if
(
DL_KERNELS
)
#
if(DL_KERNELS)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp
)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
list
(
APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp
)
#
list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
endif
()
#
endif()
set
(
PROFILER_EXECUTABLE ckProfiler
)
set
(
PROFILER_EXECUTABLE ckProfiler
)
...
@@ -79,78 +79,78 @@ add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
...
@@ -79,78 +79,78 @@ add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options
(
${
PROFILER_EXECUTABLE
}
PRIVATE -Wno-global-constructors
)
target_compile_options
(
${
PROFILER_EXECUTABLE
}
PRIVATE -Wno-global-constructors
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE utility getopt::getopt
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE utility getopt::getopt
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_gamma_beta_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_softmax_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_reduce_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batchnorm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool3d_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_avg_pool3d_bwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_column_to_image_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_transpose_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_permute_scale_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if
(
GPU_TARGETS MATCHES
"gfx9"
)
#
if(GPU_TARGETS MATCHES "gfx9")
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
endif
()
#
endif()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_add_fastgelu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_fastgelu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_gemm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_add_relu_gemm_add_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_streamk_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_fastgelu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_silu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_add_layernorm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_fixed_nk_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_fastgelu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_multiple_d_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_multiple_d_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_tile_loop_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance)
endif
()
#
endif()
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_reduce_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_add_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_universal_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_reduce_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bias_add_reduce_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv1d_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv3d_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
endif
()
#
endif()
if
(
GPU_TARGETS MATCHES
"gfx9"
OR GPU_TARGETS MATCHES
"gfx11"
)
#
if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11")
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bilinear_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
endif
()
#
endif()
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_fwd_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
endif
()
#
endif()
if
(
DL_KERNELS
)
#
if(DL_KERNELS)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_weight_instance
)
#
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
endif
()
#
endif()
rocm_install
(
TARGETS
${
PROFILER_EXECUTABLE
}
COMPONENT profiler
)
rocm_install
(
TARGETS
${
PROFILER_EXECUTABLE
}
COMPONENT profiler
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment