Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
b76d3d9f
Unverified
Commit
b76d3d9f
authored
Mar 11, 2024
by
Illia Silin
Committed by
GitHub
Mar 11, 2024
Browse files
Merge branch 'develop' into navi4x_wmma
parents
7d6cea85
9de63596
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
191 additions
and
141 deletions
+191
-141
Jenkinsfile
Jenkinsfile
+2
-2
docs/conf.py
docs/conf.py
+2
-0
docs/wrapper.rst
docs/wrapper.rst
+7
-7
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+11
-7
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+3
-3
include/ck/wrapper/layout.hpp
include/ck/wrapper/layout.hpp
+9
-0
include/ck/wrapper/operations/copy.hpp
include/ck/wrapper/operations/copy.hpp
+3
-0
include/ck/wrapper/operations/gemm.hpp
include/ck/wrapper/operations/gemm.hpp
+6
-0
include/ck/wrapper/tensor.hpp
include/ck/wrapper/tensor.hpp
+9
-0
include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp
include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp
+3
-0
include/ck/wrapper/utils/kernel_utils.hpp
include/ck/wrapper/utils/kernel_utils.hpp
+3
-0
include/ck/wrapper/utils/layout_utils.hpp
include/ck/wrapper/utils/layout_utils.hpp
+4
-1
include/ck/wrapper/utils/tensor_partition.hpp
include/ck/wrapper/utils/tensor_partition.hpp
+6
-0
include/ck/wrapper/utils/tensor_utils.hpp
include/ck/wrapper/utils/tensor_utils.hpp
+4
-1
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+117
-118
test/grouped_convnd_bwd_weight/CMakeLists.txt
test/grouped_convnd_bwd_weight/CMakeLists.txt
+1
-1
test/wrapper/CMakeLists.txt
test/wrapper/CMakeLists.txt
+1
-1
No files found.
Jenkinsfile
View file @
b76d3d9f
...
...
@@ -496,7 +496,7 @@ def Build_CK(Map conf=[:]){
def
navi_node
=
0
def
mi300_node
=
0
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCm'
,
repo:
'composable_kernel-internal'
)
{
gitStatusWrapper
(
credentialsId:
"${
env.
status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCm'
,
repo:
'composable_kernel-internal'
)
{
try
{
(
retimage
,
image
)
=
getDockerImage
(
conf
)
withDockerContainer
(
image:
image
,
args:
dockerOpts
)
{
...
...
@@ -602,7 +602,7 @@ def process_results(Map conf=[:]){
def
variant
=
env
.
STAGE_NAME
def
retimage
gitStatusWrapper
(
credentialsId:
"${status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCm'
,
repo:
'composable_kernel-internal'
)
{
gitStatusWrapper
(
credentialsId:
"${
env.
status_wrapper_creds}"
,
gitHubContext:
"Jenkins - ${variant}"
,
account:
'ROCm'
,
repo:
'composable_kernel-internal'
)
{
try
{
(
retimage
,
image
)
=
getDockerImage
(
conf
)
}
...
...
docs/conf.py
View file @
b76d3d9f
...
...
@@ -45,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:
extensions
+=
[
'sphinxcontrib.bibtex'
]
bibtex_bibfiles
=
[
'refs.bib'
]
cpp_id_attributes
=
[
"__global__"
,
"__device__"
,
"__host__"
]
docs/wrapper.rst
View file @
b76d3d9f
...
...
@@ -64,31 +64,31 @@ Advanced examples:
Layout
-------------------------------------
.. doxygenstruct::
ck::wrapper::
Layout
.. doxygenstruct:: Layout
-------------------------------------
Layout helpers
-------------------------------------
.. doxygenfile:: layout_utils.hpp
.. doxygenfile::
include/ck/wrapper/utils/
layout_utils.hpp
-------------------------------------
Tensor
-------------------------------------
.. doxygenstruct::
ck::wrapper::
Tensor
.. doxygenstruct:: Tensor
-------------------------------------
Tensor helpers
-------------------------------------
.. doxygenfile:: tensor_utils.hpp
.. doxygenfile::
include/ck/wrapper/utils/
tensor_utils.hpp
.. doxygenfile:: tensor_partition.hpp
.. doxygenfile::
include/ck/wrapper/utils/
tensor_partition.hpp
-------------------------------------
Operations
-------------------------------------
.. doxygenfile:: copy.hpp
.. doxygenfile:: gemm.hpp
.. doxygenfile::
include/ck/wrapper/operations/
copy.hpp
.. doxygenfile::
include/ck/wrapper/operations/
gemm.hpp
example/01_gemm/CMakeLists.txt
View file @
b76d3d9f
...
...
@@ -27,7 +27,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
add_example_executable
(
example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
OR GPU_TARGETS MATCHES
"gfx1200"
)
if
(
GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx12"
)
add_custom_target
(
example_gemm_wmma
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_example_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
...
...
@@ -53,12 +54,6 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
add_example_executable
(
example_gemm_xdl_streamk gemm_xdl_streamk.cpp
)
add_example_executable
(
example_gemm_xdl_fp8 gemm_xdl_fp8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp8
)
add_example_executable
(
example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp8_bf8
)
list
(
APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
...
@@ -71,3 +66,12 @@ foreach(gpu IN LISTS GPU_TARGETS)
set
(
target 1
)
endif
()
endforeach
()
add_example_executable
(
example_gemm_xdl_fp8 gemm_xdl_fp8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp8
)
add_example_executable
(
example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp8_bf8
)
add_example_executable
(
example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp
)
add_example_dependencies
(
example_gemm_xdl example_gemm_xdl_fp16_fp8
)
example/01_gemm/run_gemm_example.inc
View file @
b76d3d9f
...
...
@@ -155,12 +155,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
break
;
case
3
:
ck
::
utils
::
Fill
Constant
<
ADataType
>
{
static_cast
<
ADataType
>
(
1.
f
)
}(
a_m_k
);
ck
::
utils
::
Fill
UniformDistributionIntegerValue
<
ADataType
>
{
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
break
;
case
4
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5
.
f
,
5
.
f
}(
a_m_k
);
ck
::
utils
::
Fill
Constant
<
BDataType
>
{
static_cast
<
BDataType
>
(
1.
f
)
}(
b_k_n
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
1
.
f
,
1
.
f
}(
a_m_k
);
ck
::
utils
::
Fill
UniformDistributionIntegerValue
<
BDataType
>
{
1.
f
,
1.
f
}(
b_k_n
);
break
;
case
5
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
2.
f
,
2.
f
}(
a_m_k
);
...
...
include/ck/wrapper/layout.hpp
View file @
b76d3d9f
...
...
@@ -5,8 +5,11 @@
#include "ck/wrapper/utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
/**
* \brief Layout wrapper that performs the tensor descriptor logic.
...
...
@@ -19,6 +22,8 @@ namespace wrapper {
template
<
typename
Shape
,
typename
UnrolledDescriptorType
>
struct
Layout
{
// Disable from doxygen docs generation
/// @cond INTERNAL
private:
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
@@ -246,6 +251,7 @@ struct Layout
using
Descriptor1dType
=
remove_cvref_t
<
decltype
(
MakeMerge1d
(
Shape
{},
UnrolledDescriptorType
{}))
>
;
using
DefaultIdxsTupleType
=
remove_cvref_t
<
decltype
(
GenerateDefaultIdxsTuple
(
Shape
{}))
>
;
/// @endcond
public:
using
LayoutShape
=
Shape
;
...
...
@@ -457,6 +463,8 @@ struct Layout
return
unrolled_descriptor_
;
}
// Disable from doxygen docs generation
/// @cond INTERNAL
private:
// All dimensions are unrolled
UnrolledDescriptorType
unrolled_descriptor_
;
...
...
@@ -469,6 +477,7 @@ struct Layout
// Descriptor1dType lengths: (8)
// MergedNestsDescriptorType lengths: (4, 2)
const
Shape
shape_
;
/// @endcond
};
}
// namespace wrapper
...
...
include/ck/wrapper/operations/copy.hpp
View file @
b76d3d9f
...
...
@@ -12,8 +12,11 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
/**
* \brief Perform optimized copy between two tensors partitions (threadwise copy).
...
...
include/ck/wrapper/operations/gemm.hpp
View file @
b76d3d9f
...
...
@@ -9,9 +9,14 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
{
namespace
detail
{
/**
...
...
@@ -45,6 +50,7 @@ __device__ constexpr auto GetBlockDescriptor()
}
// namespace detail
}
// namespace
/// @endcond
/**
* \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be
...
...
include/ck/wrapper/tensor.hpp
View file @
b76d3d9f
...
...
@@ -7,9 +7,14 @@
#include "utils/tensor_partition.hpp"
#include "utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
{
namespace
detail
{
/**
...
...
@@ -189,6 +194,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>&
}
}
// namespace detail
}
// namespace
/// @endcond
/**
* \brief Tensor wrapper that performs static and dynamic buffer logic.
...
...
@@ -394,6 +400,8 @@ struct Tensor
}
private:
// Disable from doxygen docs generation
/// @cond INTERNAL
using
DynamicBufferType
=
DynamicBuffer
<
BufferAddressSpace
,
ElementType
,
ElementSpaceSize
,
...
...
@@ -428,6 +436,7 @@ struct Tensor
// tensor descriptor (thus all it's transforms) and is linear (1D).
// We store base_offset_ to avoid multiple recalculations.
index_t
base_offset_
;
/// @endcond
};
}
// namespace wrapper
...
...
include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp
View file @
b76d3d9f
...
...
@@ -5,8 +5,11 @@
#include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
/**
* \brief Traits for blockwise gemm xdl.
...
...
include/ck/wrapper/utils/kernel_utils.hpp
View file @
b76d3d9f
...
...
@@ -5,8 +5,11 @@
#include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...
...
include/ck/wrapper/utils/layout_utils.hpp
View file @
b76d3d9f
...
...
@@ -17,11 +17,14 @@
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
// Disable from doxygen docs generation
/// @cond
/// @cond
INTERNAL
// forward declaration
template
<
typename
Shape
,
typename
UnrolledDescriptorType
>
struct
Layout
;
...
...
include/ck/wrapper/utils/tensor_partition.hpp
View file @
b76d3d9f
...
...
@@ -9,9 +9,14 @@
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
{
namespace
detail
{
...
...
@@ -236,6 +241,7 @@ __host__ __device__ constexpr auto CalculateThreadMultiIdx(
}
}
// namespace detail
}
// namespace
/// @endcond
/**
* \brief Create local partition for thread (At now only packed partition
...
...
include/ck/wrapper/utils/tensor_utils.hpp
View file @
b76d3d9f
...
...
@@ -13,8 +13,11 @@
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace
ck
{
namespace
wrapper
{
/// @endcond
/**
* \brief Memory type, allowed members:
...
...
@@ -27,7 +30,7 @@ namespace wrapper {
using
MemoryTypeEnum
=
AddressSpaceEnum
;
// Disable from doxygen docs generation
/// @cond
/// @cond
INTERNAL
// forward declarations
template
<
typename
Shape
,
typename
UnrolledDescriptorType
>
struct
Layout
;
...
...
profiler/src/CMakeLists.txt
View file @
b76d3d9f
...
...
@@ -2,64 +2,64 @@
set
(
PROFILER_SOURCES
profiler.cpp
profile_gemm.cpp
#
profile_gemm_splitk.cpp
#
profile_gemm_bias_add_reduce.cpp
#
profile_gemm_add_multiply.cpp
#
profile_gemm_multiply_add.cpp
#
profile_gemm_reduce.cpp
#
profile_batched_gemm.cpp
#
profile_batched_gemm_reduce.cpp
#
profile_conv_fwd.cpp
#
profile_conv_fwd_bias_relu.cpp
#
profile_conv_fwd_bias_relu_add.cpp
#
profile_conv_bwd_data.cpp
#
profile_grouped_conv_fwd.cpp
#
profile_grouped_conv_bwd_weight.cpp
#
profile_reduce.cpp
#
profile_groupnorm_bwd_data.cpp
#
profile_groupnorm_fwd.cpp
#
profile_layernorm_bwd_data.cpp
#
profile_layernorm_bwd_gamma_beta.cpp
#
profile_groupnorm_bwd_gamma_beta.cpp
#
profile_layernorm_fwd.cpp
#
profile_max_pool3d_fwd.cpp
#
profile_avg_pool3d_bwd.cpp
#
profile_max_pool3d_bwd.cpp
#
profile_softmax.cpp
#
profile_batchnorm_fwd.cpp
#
profile_batchnorm_bwd.cpp
#
profile_batchnorm_infer.cpp
#
profile_grouped_conv_bwd_data.cpp
#
profile_conv_tensor_rearrange.cpp
#
profile_transpose.cpp
#
profile_permute_scale.cpp
profile_gemm_splitk.cpp
profile_gemm_bias_add_reduce.cpp
profile_gemm_add_multiply.cpp
profile_gemm_multiply_add.cpp
profile_gemm_reduce.cpp
profile_batched_gemm.cpp
profile_batched_gemm_reduce.cpp
profile_conv_fwd.cpp
profile_conv_fwd_bias_relu.cpp
profile_conv_fwd_bias_relu_add.cpp
profile_conv_bwd_data.cpp
profile_grouped_conv_fwd.cpp
profile_grouped_conv_bwd_weight.cpp
profile_reduce.cpp
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
profile_softmax.cpp
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
profile_permute_scale.cpp
)
#
if(DL_KERNELS)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
#
endif()
#
#
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
#
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
#
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
#
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
#
endif()
#
#
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
#
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
#
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
#
endif()
if
(
DL_KERNELS
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_streamk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp
)
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp
)
endif
()
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
endif
()
set
(
PROFILER_EXECUTABLE ckProfiler
)
...
...
@@ -68,67 +68,66 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE utility getopt::getopt
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_instance
)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
#
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
#endif()
#
#
#
#if(DL_KERNELS)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
#endif()
#
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
#endif()
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_splitk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_multiply_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bias_add_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv1d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv1d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_weight_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_conv2d_fwd_bias_relu_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_normalization_bwd_gamma_beta_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_softmax_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batchnorm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_pool3d_fwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_avg_pool3d_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_max_pool_bwd_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv2d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_bwd_data_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_image_to_column_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_column_to_image_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_transpose_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_permute_scale_instance
)
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
endif
()
if
(
DL_KERNELS
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_multi_d_instance
)
endif
()
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_silu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_relu_add_layernorm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_add_add_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_streamk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_fastgelu_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_add_relu_gemm_add_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_fixed_nk_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_gemm_fastgelu_instance
)
endif
()
rocm_install
(
TARGETS
${
PROFILER_EXECUTABLE
}
COMPONENT profiler
)
test/grouped_convnd_bwd_weight/CMakeLists.txt
View file @
b76d3d9f
list
(
APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950
)
list
(
APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102
)
list
(
APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102
gfx1103
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
...
test/wrapper/CMakeLists.txt
View file @
b76d3d9f
...
...
@@ -14,7 +14,7 @@ target_link_libraries(test_wrapper_partition PRIVATE utility)
add_dependencies
(
test_wrapper test_wrapper_partition
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR
GPU_TARGETS MATCHES
"gfx940"
OR GPU_TARGETS MATCHES
"gfx941"
OR
GPU_TARGETS MATCHES
"gfx942"
)
GPU_TARGETS MATCHES
"gfx942"
OR GPU_TARGETS MATCHES
"gfx950"
)
add_gtest_executable
(
test_wrapper_gemm test_wrapper_gemm.cpp
)
target_link_libraries
(
test_wrapper_gemm PRIVATE utility
)
add_dependencies
(
test_wrapper test_wrapper_gemm
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment