Unverified commit b76d3d9f, authored by Illia Silin and committed by GitHub

Merge branch 'develop' into navi4x_wmma

parents 7d6cea85 9de63596
......@@ -496,7 +496,7 @@ def Build_CK(Map conf=[:]){
def navi_node = 0
def mi300_node = 0
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
try {
(retimage, image) = getDockerImage(conf)
withDockerContainer(image: image, args: dockerOpts) {
......@@ -602,7 +602,7 @@ def process_results(Map conf=[:]){
def variant = env.STAGE_NAME
def retimage
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel-internal') {
try {
(retimage, image) = getDockerImage(conf)
}
......
......@@ -45,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:
extensions += ['sphinxcontrib.bibtex']
bibtex_bibfiles = ['refs.bib']
cpp_id_attributes = ["__global__", "__device__", "__host__"]
......@@ -64,31 +64,31 @@ Advanced examples:
Layout
-------------------------------------
.. doxygenstruct:: ck::wrapper::Layout
.. doxygenstruct:: Layout
-------------------------------------
Layout helpers
-------------------------------------
.. doxygenfile:: layout_utils.hpp
.. doxygenfile:: include/ck/wrapper/utils/layout_utils.hpp
-------------------------------------
Tensor
-------------------------------------
.. doxygenstruct:: ck::wrapper::Tensor
.. doxygenstruct:: Tensor
-------------------------------------
Tensor helpers
-------------------------------------
.. doxygenfile:: tensor_utils.hpp
.. doxygenfile:: include/ck/wrapper/utils/tensor_utils.hpp
.. doxygenfile:: tensor_partition.hpp
.. doxygenfile:: include/ck/wrapper/utils/tensor_partition.hpp
-------------------------------------
Operations
-------------------------------------
.. doxygenfile:: copy.hpp
.. doxygenfile:: gemm.hpp
.. doxygenfile:: include/ck/wrapper/operations/copy.hpp
.. doxygenfile:: include/ck/wrapper/operations/gemm.hpp
......@@ -27,7 +27,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16)
if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102" OR GPU_TARGETS MATCHES "gfx1200")
if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_gemm_wmma)
add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp)
add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
......@@ -53,12 +54,6 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64)
add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942 gfx950)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
......@@ -71,3 +66,12 @@ foreach(gpu IN LISTS GPU_TARGETS)
set(target 1)
endif()
endforeach()
add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8)
add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8)
add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8)
......@@ -155,12 +155,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
break;
case 3:
ck::utils::FillConstant<ADataType>{static_cast<ADataType>(1.f)}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
case 4:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillConstant<BDataType>{static_cast<BDataType>(1.f)}(b_k_n);
ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{1.f, 1.f}(b_k_n);
break;
case 5:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
......
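A note on the initialization change above: replacing FillConstant with FillUniformDistributionIntegerValue{1.f, 1.f} preserves the intent of cases 3 and 4, because an integer uniform distribution whose lower and upper bounds are both 1 can only ever produce 1. A minimal standalone sketch of that degenerate-range behaviour, using the standard library rather than CK's fill utilities (the snippet is illustrative and does not use CK types):

#include <iostream>
#include <random>

int main()
{
    // A uniform integer distribution with equal bounds is degenerate:
    // every draw returns the single admissible value, which is why a
    // {1.f, 1.f} integer fill behaves like a constant fill of ones.
    std::mt19937 gen{42};
    std::uniform_int_distribution<int> ones{1, 1};

    for(int i = 0; i < 4; ++i)
        std::cout << ones(gen) << ' '; // prints: 1 1 1 1
    std::cout << '\n';
    return 0;
}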
......@@ -5,8 +5,11 @@
#include "ck/wrapper/utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
/**
* \brief Layout wrapper that performs the tensor descriptor logic.
......@@ -19,6 +22,8 @@ namespace wrapper {
template <typename Shape, typename UnrolledDescriptorType>
struct Layout
{
// Disable from doxygen docs generation
/// @cond INTERNAL
private:
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
......@@ -246,6 +251,7 @@ struct Layout
using Descriptor1dType =
remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
/// @endcond
public:
using LayoutShape = Shape;
......@@ -457,6 +463,8 @@ struct Layout
return unrolled_descriptor_;
}
// Disable from doxygen docs generation
/// @cond INTERNAL
private:
// All dimensions are unrolled
UnrolledDescriptorType unrolled_descriptor_;
......@@ -469,6 +477,7 @@ struct Layout
// Descriptor1dType lengths: (8)
// MergedNestsDescriptorType lengths: (4, 2)
const Shape shape_;
/// @endcond
};
} // namespace wrapper
......
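The guard pattern added throughout these wrapper headers works as follows: everything between /// @cond INTERNAL and /// @endcond is skipped by Doxygen unless the INTERNAL section is explicitly enabled, so namespace openings, private members, and internal typedefs stay out of the generated reference while the public API remains documented. A minimal illustration of the pattern (the struct and members below are invented for the example and are not CK code):

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace demo {
/// @endcond

/**
 * \brief Public, documented example type.
 */
struct Widget
{
    /// Returns the stored value (documented, public).
    int value() const { return value_; }

    // Disable from doxygen docs generation
    /// @cond INTERNAL
private:
    int value_ = 0; // hidden from the generated docs
    /// @endcond
};

// Disable from doxygen docs generation
/// @cond INTERNAL
} // namespace demo
/// @endcond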
......@@ -12,8 +12,11 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
/**
* \brief Perform optimized copy between two tensors partitions (threadwise copy).
......
......@@ -9,9 +9,14 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
/**
......@@ -45,6 +50,7 @@ __device__ constexpr auto GetBlockDescriptor()
} // namespace detail
} // namespace
/// @endcond
/**
* \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be
......
......@@ -7,9 +7,14 @@
#include "utils/tensor_partition.hpp"
#include "utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
/**
......@@ -189,6 +194,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>&
}
} // namespace detail
} // namespace
/// @endcond
/**
* \brief Tensor wrapper that performs static and dynamic buffer logic.
......@@ -394,6 +400,8 @@ struct Tensor
}
private:
// Disable from doxygen docs generation
/// @cond INTERNAL
using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
ElementType,
ElementSpaceSize,
......@@ -428,6 +436,7 @@ struct Tensor
// tensor descriptor (thus all it's transforms) and is linear (1D).
// We store base_offset_ to avoid multiple recalculations.
index_t base_offset_;
/// @endcond
};
} // namespace wrapper
......
......@@ -5,8 +5,11 @@
#include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
/**
* \brief Traits for blockwise gemm xdl.
......
......@@ -5,8 +5,11 @@
#include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
......
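For context on the macro above: __CK_WRAPPER_LAUNCH_BOUNDS__ expands to __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU), which tells the compiler the maximum threads per block and the minimum blocks per compute unit to target when allocating registers. A hedged sketch of how such a macro is typically attached to a HIP kernel; the kernel and the fallback macro values below are illustrative placeholders, not the wrapper's actual kernels or configuration:

#include <hip/hip_runtime.h>

// Illustrative fallbacks only; the real values come from ck/ck.hpp.
#ifndef CK_MAX_THREAD_PER_BLOCK
#define CK_MAX_THREAD_PER_BLOCK 256
#endif
#ifndef CK_MIN_BLOCK_PER_CU
#define CK_MIN_BLOCK_PER_CU 1
#endif

#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)

// Hypothetical kernel: the launch-bounds attribute sits between __global__
// and the function name and constrains the compiler's occupancy assumptions.
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ scale_kernel(float* data, float alpha, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n)
        data[idx] *= alpha;
}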
......@@ -17,11 +17,14 @@
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond
/// @cond INTERNAL
// forward declaration
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
......
......@@ -9,9 +9,14 @@
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
......@@ -236,6 +241,7 @@ __host__ __device__ constexpr auto CalculateThreadMultiIdx(
}
} // namespace detail
} // namespace
/// @endcond
/**
* \brief Create local partition for thread (At now only packed partition
......
......@@ -13,8 +13,11 @@
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond
/**
* \brief Memory type, allowed members:
......@@ -27,7 +30,7 @@ namespace wrapper {
using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation
/// @cond
/// @cond INTERNAL
// forward declarations
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
......
......@@ -2,64 +2,64 @@
set(PROFILER_SOURCES
profiler.cpp
profile_gemm.cpp
#profile_gemm_splitk.cpp
#profile_gemm_bias_add_reduce.cpp
#profile_gemm_add_multiply.cpp
#profile_gemm_multiply_add.cpp
#profile_gemm_reduce.cpp
#profile_batched_gemm.cpp
#profile_batched_gemm_reduce.cpp
#profile_conv_fwd.cpp
#profile_conv_fwd_bias_relu.cpp
#profile_conv_fwd_bias_relu_add.cpp
#profile_conv_bwd_data.cpp
#profile_grouped_conv_fwd.cpp
#profile_grouped_conv_bwd_weight.cpp
#profile_reduce.cpp
#profile_groupnorm_bwd_data.cpp
#profile_groupnorm_fwd.cpp
#profile_layernorm_bwd_data.cpp
#profile_layernorm_bwd_gamma_beta.cpp
#profile_groupnorm_bwd_gamma_beta.cpp
#profile_layernorm_fwd.cpp
#profile_max_pool3d_fwd.cpp
#profile_avg_pool3d_bwd.cpp
#profile_max_pool3d_bwd.cpp
#profile_softmax.cpp
#profile_batchnorm_fwd.cpp
#profile_batchnorm_bwd.cpp
#profile_batchnorm_infer.cpp
#profile_grouped_conv_bwd_data.cpp
#profile_conv_tensor_rearrange.cpp
#profile_transpose.cpp
#profile_permute_scale.cpp
profile_gemm_splitk.cpp
profile_gemm_bias_add_reduce.cpp
profile_gemm_add_multiply.cpp
profile_gemm_multiply_add.cpp
profile_gemm_reduce.cpp
profile_batched_gemm.cpp
profile_batched_gemm_reduce.cpp
profile_conv_fwd.cpp
profile_conv_fwd_bias_relu.cpp
profile_conv_fwd_bias_relu_add.cpp
profile_conv_bwd_data.cpp
profile_grouped_conv_fwd.cpp
profile_grouped_conv_bwd_weight.cpp
profile_reduce.cpp
profile_groupnorm_bwd_data.cpp
profile_groupnorm_fwd.cpp
profile_layernorm_bwd_data.cpp
profile_layernorm_bwd_gamma_beta.cpp
profile_groupnorm_bwd_gamma_beta.cpp
profile_layernorm_fwd.cpp
profile_max_pool3d_fwd.cpp
profile_avg_pool3d_bwd.cpp
profile_max_pool3d_bwd.cpp
profile_softmax.cpp
profile_batchnorm_fwd.cpp
profile_batchnorm_bwd.cpp
profile_batchnorm_infer.cpp
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
profile_permute_scale.cpp
)
#if(DL_KERNELS)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
#endif()
#
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
#endif()
#
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
# list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
# list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
#endif()
if(DL_KERNELS)
list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
endif()
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
endif()
set(PROFILER_EXECUTABLE ckProfiler)
......@@ -68,67 +68,66 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
#
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
#endif()
#
#
#
#if(DL_KERNELS)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
#endif()
#
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
#endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
endif()
if(DL_KERNELS)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
......
......@@ -14,7 +14,7 @@ target_link_libraries(test_wrapper_partition PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_partition)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
GPU_TARGETS MATCHES "gfx942")
GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950")
add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
target_link_libraries(test_wrapper_gemm PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_gemm)
......