gaoqiong / composable_kernel_ROCM

Commit 5cb59d36, authored Apr 07, 2024 by Jing Zhang

    resolve conflicts

Parents: 7e3a5613, 7e147c64

Changes: 36 files in total; this page lists 16 changed files with 183 additions and 130 deletions.
include/ck/utility/synchronization.hpp                              +12  -3
include/ck/wrapper/layout.hpp                                        +9  -0
include/ck/wrapper/operations/copy.hpp                               +3  -0
include/ck/wrapper/operations/gemm.hpp                               +6  -0
include/ck/wrapper/tensor.hpp                                        +9  -0
include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp              +3  -0
include/ck/wrapper/utils/kernel_utils.hpp                            +3  -0
include/ck/wrapper/utils/layout_utils.hpp                            +4  -1
include/ck/wrapper/utils/tensor_partition.hpp                        +6  -0
include/ck/wrapper/utils/tensor_utils.hpp                            +4  -1
profiler/src/CMakeLists.txt                                        +116  -118
script/cmake-ck-dev.sh                                               +2  -2
test/grouped_convnd_bwd_data/CMakeLists.txt                          +1  -1
test/grouped_convnd_bwd_weight/CMakeLists.txt                        +1  -1
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp    +1  -1
test/wrapper/CMakeLists.txt                                          +3  -2
include/ck/utility/synchronization.hpp

@@ -12,9 +12,9 @@ __device__ void block_sync_lds()
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#ifdef __gfx12__
    asm volatile("\
    s_wait_dscnt 0x0 \n \
    s_barrier_signal -1 \n \
    s_barrier_wait -1 \
    " ::);
#else
    asm volatile("\
...
@@ -29,11 +29,20 @@ __device__ void block_sync_lds()
__device__ void block_sync_lds_direct_load()
{
#ifdef __gfx12__
    asm volatile("\
    s_wait_vmcnt 0x0 \n \
    s_wait_dscnt 0x0 \n \
    s_barrier_signal -1 \n \
    s_barrier_wait -1 \
    " ::);
#else
    asm volatile("\
    s_waitcnt vmcnt(0) \n \
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#endif
}

__device__ void s_nop()
...
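On gfx12 the single s_waitcnt/s_barrier sequence is replaced by the split s_barrier_signal / s_barrier_wait pair plus the new s_wait_dscnt / s_wait_vmcnt counters. Below is a minimal, illustrative HIP sketch of the produce-in-LDS / synchronize / consume-from-LDS pattern these helpers guard; the kernel name, tile size, and indexing are assumptions, not code from this commit.

// Illustrative sketch only (not part of this commit): every thread stages one
// element in LDS, the whole block synchronizes via ck::block_sync_lds(), and
// only then does any thread read a neighbour's element.
#include <hip/hip_runtime.h>
#include "ck/utility/synchronization.hpp"

__global__ void lds_roundtrip(const float* in, float* out)
{
    __shared__ float tile[256]; // LDS staging buffer; block size assumed to be 256
    const int tid = threadIdx.x;
    const int gid = blockIdx.x * blockDim.x + tid;

    tile[tid] = in[gid];              // write this thread's element into LDS
    ck::block_sync_lds();             // wait for the LDS writes, then barrier the block
    out[gid] = tile[(tid + 1) % 256]; // only now is a neighbour's element safe to read
}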
include/ck/wrapper/layout.hpp

@@ -5,8 +5,11 @@
#include "ck/wrapper/utils/layout_utils.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

/**
 * \brief Layout wrapper that performs the tensor descriptor logic.
...
@@ -19,6 +22,8 @@ namespace wrapper {
template <typename Shape, typename UnrolledDescriptorType>
struct Layout
{
    // Disable from doxygen docs generation
    /// @cond INTERNAL
    private:
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
...
@@ -246,6 +251,7 @@ struct Layout
    using Descriptor1dType =
        remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnrolledDescriptorType{}))>;
    using DefaultIdxsTupleType =
        remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
    /// @endcond

    public:
    using LayoutShape = Shape;
...
@@ -457,6 +463,8 @@ struct Layout
        return unrolled_descriptor_;
    }

    // Disable from doxygen docs generation
    /// @cond INTERNAL
    private:
    // All dimensions are unrolled
    UnrolledDescriptorType unrolled_descriptor_;
...
@@ -469,6 +477,7 @@ struct Layout
    // Descriptor1dType lengths: (8)
    // MergedNestsDescriptorType lengths: (4, 2)
    const Shape shape_;
    /// @endcond
};
} // namespace wrapper
...
include/ck/wrapper/operations/copy.hpp

@@ -12,8 +12,11 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_description/tensor_space_filling_curve.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

/**
 * \brief Perform optimized copy between two tensors partitions (threadwise copy).
...
include/ck/wrapper/operations/gemm.hpp

@@ -9,9 +9,14 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
/**
...
@@ -45,6 +50,7 @@ __device__ constexpr auto GetBlockDescriptor()
} // namespace detail
} // namespace
/// @endcond

/**
 * \brief Perform blockwise gemm xdl on tensors stored in lds. Result will be
...
include/ck/wrapper/tensor.hpp

@@ -7,9 +7,14 @@
#include "utils/tensor_partition.hpp"
#include "utils/layout_utils.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
/**
...
@@ -189,6 +194,7 @@ __host__ __device__ constexpr auto GenerateSlicedDescriptor(const Tuple<Ts...>&
}
} // namespace detail
} // namespace
/// @endcond

/**
 * \brief Tensor wrapper that performs static and dynamic buffer logic.
...
@@ -394,6 +400,8 @@ struct Tensor
    }

    private:
    // Disable from doxygen docs generation
    /// @cond INTERNAL
    using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
                                            ElementType,
                                            ElementSpaceSize,
...
@@ -428,6 +436,7 @@ struct Tensor
    // tensor descriptor (thus all it's transforms) and is linear (1D).
    // We store base_offset_ to avoid multiple recalculations.
    index_t base_offset_;
    /// @endcond
};
} // namespace wrapper
...
include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp

@@ -5,8 +5,11 @@
#include "ck/ck.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

/**
 * \brief Traits for blockwise gemm xdl.
...
include/ck/wrapper/utils/kernel_utils.hpp

@@ -5,8 +5,11 @@
#include "ck/ck.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

#define __CK_WRAPPER_LAUNCH_BOUNDS__ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...
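The macro simply forwards the library's CK_MAX_THREAD_PER_BLOCK / CK_MIN_BLOCK_PER_CU limits to __launch_bounds__. A minimal, illustrative sketch of how a kernel could apply it; the kernel name and body are placeholders, not code from this repository.

// Illustrative sketch only: annotating a kernel with the wrapper launch-bounds
// macro caps register usage to the configured occupancy target.
#include <hip/hip_runtime.h>
#include "ck/wrapper/utils/kernel_utils.hpp"

__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ scale_kernel(int n, float alpha, float* x)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
    {
        x[i] *= alpha; // trivial placeholder body
    }
}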
include/ck/wrapper/utils/layout_utils.hpp

@@ -17,11 +17,14 @@
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

// Disable from doxygen docs generation
/// @cond INTERNAL
// forward declaration
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
...
include/ck/wrapper/utils/tensor_partition.hpp

@@ -9,9 +9,14 @@
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace {
namespace detail {
...
@@ -236,6 +241,7 @@ __host__ __device__ constexpr auto CalculateThreadMultiIdx(
}
} // namespace detail
} // namespace
/// @endcond

/**
 * \brief Create local partition for thread (At now only packed partition
...
include/ck/wrapper/utils/tensor_utils.hpp

@@ -13,8 +13,11 @@
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/multi_index.hpp"

// Disable from doxygen docs generation
/// @cond INTERNAL
namespace ck {
namespace wrapper {
/// @endcond

/**
 * \brief Memory type, allowed members:
...
@@ -27,7 +30,7 @@ namespace wrapper {
using MemoryTypeEnum = AddressSpaceEnum;

// Disable from doxygen docs generation
/// @cond INTERNAL
// forward declarations
template <typename Shape, typename UnrolledDescriptorType>
struct Layout;
...
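MemoryTypeEnum is an alias of the core library's AddressSpaceEnum, so wrapper-level and core-level memory-space tags are the same enumeration. A small, illustrative sanity check; the Global and Lds enumerators are assumed from ck/utility/amd_address_space.hpp, which is not shown in this diff.

// Illustrative sketch only: the alias makes both names interchangeable.
#include "ck/wrapper/utils/tensor_utils.hpp"

static_assert(ck::wrapper::MemoryTypeEnum::Global == ck::AddressSpaceEnum::Global,
              "wrapper and core memory-space tags are the same enumeration");
static_assert(ck::wrapper::MemoryTypeEnum::Lds == ck::AddressSpaceEnum::Lds,
              "the Lds tag is shared as well");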
profiler/src/CMakeLists.txt

The first hunk re-enables profiler sources that were commented out (the removed side of the hunk is the same list of entries prefixed with "#"); the second hunk does the same for the instance libraries linked into ckProfiler.

@@ -2,64 +2,64 @@
set(PROFILER_SOURCES
    profiler.cpp
    profile_gemm.cpp
    profile_gemm_splitk.cpp
    profile_gemm_bias_add_reduce.cpp
    profile_gemm_add_multiply.cpp
    profile_gemm_multiply_add.cpp
    profile_gemm_reduce.cpp
    profile_batched_gemm.cpp
    profile_batched_gemm_reduce.cpp
    profile_conv_fwd.cpp
    profile_conv_fwd_bias_relu.cpp
    profile_conv_fwd_bias_relu_add.cpp
    profile_conv_bwd_data.cpp
    profile_grouped_conv_fwd.cpp
    profile_grouped_conv_bwd_weight.cpp
    profile_reduce.cpp
    profile_groupnorm_bwd_data.cpp
    profile_groupnorm_fwd.cpp
    profile_layernorm_bwd_data.cpp
    profile_layernorm_bwd_gamma_beta.cpp
    profile_groupnorm_bwd_gamma_beta.cpp
    profile_layernorm_fwd.cpp
    profile_max_pool3d_fwd.cpp
    profile_avg_pool3d_bwd.cpp
    profile_max_pool3d_bwd.cpp
    profile_softmax.cpp
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
    profile_batchnorm_infer.cpp
    profile_grouped_conv_bwd_data.cpp
    profile_conv_tensor_rearrange.cpp
    profile_transpose.cpp
    profile_permute_scale.cpp
)

if(DL_KERNELS)
    list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
endif()

if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
    list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
    list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
endif()

if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
    list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
    list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
endif()

set(PROFILER_EXECUTABLE ckProfiler)
...
@@ -68,67 +68,65 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)

if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
endif()

if(DL_KERNELS)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
endif()

if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif()

rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
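For reference, registering an additional profiler module follows the same two-step pattern now active above. This is a hypothetical sketch: profile_my_op.cpp and device_my_op_instance are placeholder names, not sources or targets that exist in the repository.

# Hypothetical sketch only: compile a new profiler translation unit and link
# the matching device-instance library into ckProfiler.
list(APPEND PROFILER_SOURCES profile_my_op.cpp)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_my_op_instance)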
script/cmake-ck-dev.sh

@@ -10,8 +10,8 @@ cmake
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
 -D CMAKE_BUILD_TYPE=Release \
--D BUILD_DEV=OFF \
--D GPU_TARGETS="gfx1200" \
+-D BUILD_DEV=ON \
+-D GPU_TARGETS="gfx908;gfx90a;gfx940" \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D USE_BITINT_EXTENSION_INT4=OFF \
 ${MY_PROJECT_SOURCE}
test/grouped_convnd_bwd_data/CMakeLists.txt

@@ -16,4 +16,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
         target_link_libraries(test_grouped_convnd_bwd_data_interface PRIVATE utility device_grouped_conv2d_bwd_data_instance)
         set(target 1)
     endif()
-endforeach()
\ No newline at end of file
+endforeach()
test/grouped_convnd_bwd_weight/CMakeLists.txt
View file @
5cb59d36
list
(
APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950
)
list
(
APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102
)
list
(
APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102
gfx1103
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
...
...
test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp

@@ -55,7 +55,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test
         }
     }
-    if(ck::is_navi3_supported())
+    if(ck::is_navi3_supported() || ck::is_navi4_supported())
     {
         // on navi3x only support for 3d is implemented
         if constexpr(NDimSpatial{} != 3)
...
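The guard now also covers navi4x devices. A hypothetical sketch of how such a guard typically short-circuits the unsupported dimensionalities inside a gtest fixture; the skip call and message are assumptions, since the body is not shown in this diff.

// Hypothetical sketch only: skip non-3D cases on navi3x/navi4x hardware.
if(ck::is_navi3_supported() || ck::is_navi4_supported())
{
    if constexpr(NDimSpatial{} != 3)
    {
        GTEST_SKIP() << "only 3D grouped conv bwd-weight is exercised on navi3x/navi4x";
    }
}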
test/wrapper/CMakeLists.txt

@@ -15,6 +15,7 @@ add_dependencies(test_wrapper test_wrapper_partition)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
   GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
   GPU_TARGETS MATCHES "gfx942" OR GPU_TARGETS MATCHES "gfx950")
    add_gtest_executable(test_gemm test_gemm.cpp)
    target_link_libraries(test_gemm PRIVATE utility)
    add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
    target_link_libraries(test_wrapper_gemm PRIVATE utility)
    add_dependencies(test_wrapper test_wrapper_gemm)
endif()