Commit 5cb59d36 authored by Jing Zhang
Browse files

resolve conflicts

parents 7e3a5613 7e147c64
...@@ -12,9 +12,9 @@ __device__ void block_sync_lds() ...@@ -12,9 +12,9 @@ __device__ void block_sync_lds()
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM #if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#ifdef __gfx12__ #ifdef __gfx12__
asm volatile("\ asm volatile("\
s_wait_dscnt 0x0 \n \ s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \ s_barrier_signal -1 \n \
s_barrier_wait -1 \ s_barrier_wait -1 \
" ::); " ::);
#else #else
asm volatile("\ asm volatile("\
...@@ -29,11 +29,20 @@ __device__ void block_sync_lds() ...@@ -29,11 +29,20 @@ __device__ void block_sync_lds()
// Synchronization helper built from inline assembly: it waits until the
// vector-memory counter and the LDS-side counter reach zero and then executes
// a barrier. NOTE(review): judging by the name, this is meant to be called
// after direct loads into LDS -- confirm against the callers.
__device__ void block_sync_lds_direct_load() __device__ void block_sync_lds_direct_load()
{ {
// gfx12 path: the split wait instructions (s_wait_vmcnt / s_wait_dscnt) and the
// s_barrier_signal + s_barrier_wait pair are used instead of s_waitcnt / s_barrier.
#ifdef __gfx12__
asm volatile("\
s_wait_vmcnt 0x0 \n \
s_wait_dscnt 0x0 \n \
s_barrier_signal -1 \n \
s_barrier_wait -1 \
" ::);
#else
// Other targets: wait for vmcnt(0) and lgkmcnt(0), then issue s_barrier.
asm volatile("\ asm volatile("\
s_waitcnt vmcnt(0) \n \ s_waitcnt vmcnt(0) \n \
s_waitcnt lgkmcnt(0) \n \ s_waitcnt lgkmcnt(0) \n \
s_barrier \ s_barrier \
" ::); " ::);
#endif
} }
__device__ void s_nop() __device__ void s_nop()
......
...@@ -5,8 +5,11 @@ ...@@ -5,8 +5,11 @@
#include "ck/wrapper/utils/layout_utils.hpp" #include "ck/wrapper/utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
/** /**
* \brief Layout wrapper that performs the tensor descriptor logic. * \brief Layout wrapper that performs the tensor descriptor logic.
...@@ -19,6 +22,8 @@ namespace wrapper { ...@@ -19,6 +22,8 @@ namespace wrapper {
template <typename Shape, typename UnrolledDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout struct Layout
{ {
// Disable from doxygen docs generation
/// @cond INTERNAL
private: private:
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -246,6 +251,7 @@ struct Layout ...@@ -246,6 +251,7 @@ struct Layout
using Descriptor1dType = using Descriptor1dType =
remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>; remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>; using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
/// @endcond
public: public:
using LayoutShape = Shape; using LayoutShape = Shape;
...@@ -457,6 +463,8 @@ struct Layout ...@@ -457,6 +463,8 @@ struct Layout
return unrolled_descriptor_; return unrolled_descriptor_;
} }
// Disable from doxygen docs generation
/// @cond INTERNAL
private: private:
// All dimensions are unrolled // All dimensions are unrolled
UnrolledDescriptorType unrolled_descriptor_; UnrolledDescriptorType unrolled_descriptor_;
...@@ -469,6 +477,7 @@ struct Layout ...@@ -469,6 +477,7 @@ struct Layout
// Descriptor1dType lengths: (8) // Descriptor1dType lengths: (8)
// MergedNestsDescriptorType lengths: (4, 2) // MergedNestsDescriptorType lengths: (4, 2)
const Shape shape_; const Shape shape_;
/// @endcond
}; };
} // namespace wrapper } // namespace wrapper
......
...@@ -12,8 +12,11 @@ ...@@ -12,8 +12,11 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp" #include "ck/tensor_description/tensor_space_filling_curve.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
/** /**
* \brief Perform optimized copy between two tensors partitions (threadwise copy). * \brief Perform optimized copy between two tensors partitions (threadwise copy).
......
...@@ -9,9 +9,14 @@ ...@@ -9,9 +9,14 @@
#include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/device_prop.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" #include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace { namespace {
namespace detail { namespace detail {
/** /**
...@@ -45,6 +50,7 @@ __device__ constexpr auto GetBlockDescriptor() ...@@ -45,6 +50,7 @@ __device__ constexpr auto GetBlockDescriptor()
} // namespace detail } // namespace detail
} // namespace } // namespace
/// @endcond
/** /**
* \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be * \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be
......
...@@ -7,9 +7,14 @@ ...@@ -7,9 +7,14 @@
#include "utils/tensor_partition.hpp" #include "utils/tensor_partition.hpp"
#include "utils/layout_utils.hpp" #include "utils/layout_utils.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace { namespace {
namespace detail { namespace detail {
/** /**
...@@ -189,6 +194,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>& ...@@ -189,6 +194,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>&
} }
} // namespace detail } // namespace detail
} // namespace } // namespace
/// @endcond
/** /**
* \brief Tensor wrapper that performs static and dynamic buffer logic. * \brief Tensor wrapper that performs static and dynamic buffer logic.
...@@ -394,6 +400,8 @@ struct Tensor ...@@ -394,6 +400,8 @@ struct Tensor
} }
private: private:
// Disable from doxygen docs generation
/// @cond INTERNAL
using DynamicBufferType = DynamicBuffer<BufferAddressSpace, using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
ElementType, ElementType,
ElementSpaceSize, ElementSpaceSize,
...@@ -428,6 +436,7 @@ struct Tensor ...@@ -428,6 +436,7 @@ struct Tensor
// tensor descriptor (thus all its transforms) and is linear (1D). // tensor descriptor (thus all its transforms) and is linear (1D).
// We store base_offset_ to avoid multiple recalculations. // We store base_offset_ to avoid multiple recalculations.
index_t base_offset_; index_t base_offset_;
/// @endcond
}; };
} // namespace wrapper } // namespace wrapper
......
...@@ -5,8 +5,11 @@ ...@@ -5,8 +5,11 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
/** /**
* \brief Traits for blockwise gemm xdl. * \brief Traits for blockwise gemm xdl.
......
...@@ -5,8 +5,11 @@ ...@@ -5,8 +5,11 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
......
...@@ -17,11 +17,14 @@ ...@@ -17,11 +17,14 @@
#include "ck/tensor_description/multi_index_transform_helper.hpp" #include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond INTERNAL
// forward declaration // forward declaration
template <typename Shape, typename UnrolledDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
......
...@@ -9,9 +9,14 @@ ...@@ -9,9 +9,14 @@
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp" #include "ck/tensor_description/cluster_descriptor.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace { namespace {
namespace detail { namespace detail {
...@@ -236,6 +241,7 @@ __host__ __device__ constexpr auto CalculateThreadMultiIdx( ...@@ -236,6 +241,7 @@ __host__ __device__ constexpr auto CalculateThreadMultiIdx(
} }
} // namespace detail } // namespace detail
} // namespace } // namespace
/// @endcond
/** /**
* \brief Create local partition for thread (currently only packed partition * \brief Create local partition for thread (currently only packed partition
......
...@@ -13,8 +13,11 @@ ...@@ -13,8 +13,11 @@
#include "ck/utility/amd_address_space.hpp" #include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp" #include "ck/utility/multi_index.hpp"
// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck { namespace ck {
namespace wrapper { namespace wrapper {
/// @endcond
/** /**
* \brief Memory type, allowed members: * \brief Memory type, allowed members:
...@@ -27,7 +30,7 @@ namespace wrapper { ...@@ -27,7 +30,7 @@ namespace wrapper {
using MemoryTypeEnum = AddressSpaceEnum; using MemoryTypeEnum = AddressSpaceEnum;
// Disable from doxygen docs generation // Disable from doxygen docs generation
/// @cond /// @cond INTERNAL
// forward declarations // forward declarations
template <typename Shape, typename UnrolledDescriptorType> template <typename Shape, typename UnrolledDescriptorType>
struct Layout; struct Layout;
......
...@@ -2,64 +2,64 @@ ...@@ -2,64 +2,64 @@
set(PROFILER_SOURCES set(PROFILER_SOURCES
profiler.cpp profiler.cpp
profile_gemm.cpp profile_gemm.cpp
#profile_gemm_splitk.cpp profile_gemm_splitk.cpp
#profile_gemm_bias_add_reduce.cpp profile_gemm_bias_add_reduce.cpp
#profile_gemm_add_multiply.cpp profile_gemm_add_multiply.cpp
#profile_gemm_multiply_add.cpp profile_gemm_multiply_add.cpp
#profile_gemm_reduce.cpp profile_gemm_reduce.cpp
#profile_batched_gemm.cpp profile_batched_gemm.cpp
#profile_batched_gemm_reduce.cpp profile_batched_gemm_reduce.cpp
#profile_conv_fwd.cpp profile_conv_fwd.cpp
#profile_conv_fwd_bias_relu.cpp profile_conv_fwd_bias_relu.cpp
#profile_conv_fwd_bias_relu_add.cpp profile_conv_fwd_bias_relu_add.cpp
#profile_conv_bwd_data.cpp profile_conv_bwd_data.cpp
#profile_grouped_conv_fwd.cpp profile_grouped_conv_fwd.cpp
#profile_grouped_conv_bwd_weight.cpp profile_grouped_conv_bwd_weight.cpp
#profile_reduce.cpp profile_reduce.cpp
#profile_groupnorm_bwd_data.cpp profile_groupnorm_bwd_data.cpp
#profile_groupnorm_fwd.cpp profile_groupnorm_fwd.cpp
#profile_layernorm_bwd_data.cpp profile_layernorm_bwd_data.cpp
#profile_layernorm_bwd_gamma_beta.cpp profile_layernorm_bwd_gamma_beta.cpp
#profile_groupnorm_bwd_gamma_beta.cpp profile_groupnorm_bwd_gamma_beta.cpp
#profile_layernorm_fwd.cpp profile_layernorm_fwd.cpp
#profile_max_pool3d_fwd.cpp profile_max_pool3d_fwd.cpp
#profile_avg_pool3d_bwd.cpp profile_avg_pool3d_bwd.cpp
#profile_max_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp
#profile_softmax.cpp profile_softmax.cpp
#profile_batchnorm_fwd.cpp profile_batchnorm_fwd.cpp
#profile_batchnorm_bwd.cpp profile_batchnorm_bwd.cpp
#profile_batchnorm_infer.cpp profile_batchnorm_infer.cpp
#profile_grouped_conv_bwd_data.cpp profile_grouped_conv_bwd_data.cpp
#profile_conv_tensor_rearrange.cpp profile_conv_tensor_rearrange.cpp
#profile_transpose.cpp profile_transpose.cpp
#profile_permute_scale.cpp profile_permute_scale.cpp
) )
#if(DL_KERNELS) if(DL_KERNELS)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
#endif() endif()
#
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp) list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
# list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
# list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
# list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
#endif() endif()
#
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
# list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp) list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
# list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp) list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
#endif() endif()
set(PROFILER_EXECUTABLE ckProfiler) set(PROFILER_EXECUTABLE ckProfiler)
...@@ -68,67 +68,65 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) ...@@ -68,67 +68,65 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
#target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
#
#if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
#endif() endif()
#
# if(DL_KERNELS)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
#if(DL_KERNELS) endif()
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
#endif() if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
#if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance) endif()
# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
#endif()
rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
...@@ -10,8 +10,8 @@ cmake ...@@ -10,8 +10,8 @@ cmake
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \ -D BUILD_DEV=ON \
-D GPU_TARGETS="gfx1200" \ -D GPU_TARGETS="gfx908;gfx90a;gfx940" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \ -D USE_BITINT_EXTENSION_INT4=OFF \
${MY_PROJECT_SOURCE} ${MY_PROJECT_SOURCE}
...@@ -16,4 +16,4 @@ foreach(gpu IN LISTS GPU_TARGETS) ...@@ -16,4 +16,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance) target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
\ No newline at end of file
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950) list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102) list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
set(target 0) set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
......
...@@ -55,7 +55,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test ...@@ -55,7 +55,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
} }
} }
if(ck::is_navi3_supported()) if(ck::is_navi3_supported() || ck::is_navi4_supported())
{ {
// on navi3x only support for 3d is implemented // on navi3x/navi4x only support for 3d is implemented
if constexpr(NDimSpatial{} != 3) if constexpr(NDimSpatial{} != 3)
......
...@@ -15,6 +15,7 @@ add_dependencies(test_wrapper test_wrapper_partition) ...@@ -15,6 +15,7 @@ add_dependencies(test_wrapper test_wrapper_partition)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950") GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950")
add_gtest_executable(test_gemm test_gemm.cpp) add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
target_link_libraries(test_gemm PRIVATE utility) target_link_libraries(test_wrapper_gemm PRIVATE utility)
add_dependencies(test_wrapper test_wrapper_gemm)
endif() endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment