Merge branch 'develop' into bwroblew/direct_load_double_buf

20ea6c75 · Bartlomiej Wroblewski · GitHub · 92a0393a · 8ff845f2 · 20ea6c75
Unverified Commit 20ea6c75 authored Nov 30, 2023 by Bartlomiej Wroblewski Committed by GitHub Nov 30, 2023
5 changed files
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -5,6 +5,7 @@

 #include "functional4.hpp"
 #include "tuple.hpp"
+#include "is_detected.hpp"

 namespace ck {

@@ -33,6 +34,28 @@ __host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple<X&...>&
        ty);
 }

+template <typename... X, typename... Y>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuple<Y...>& ty)
+{
+    // Re-packs the by-value arguments handed over by unpack2 into one Tuple of their types.
+    const auto pack_into_tuple = [&](auto... elems) {
+        return Tuple<decltype(elems)...>{std::forward<decltype(elems)>(elems)...};
+    };
+
+    return unpack2(pack_into_tuple, tx, ty);
+}
+
+// Single-tuple overload so concat_tuple accepts any number of tuples (also 1); returns a copy of tx.
+template <typename... X>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx)
+{
+    return tx;
+}
+
+template <typename... X, typename... Tuples>
+__host__ __device__ constexpr auto concat_tuple(const Tuple<X...>& tx, const Tuples&... tuples)
+{
+    // Join all trailing tuples first, then put tx in front of that result.
+    const auto joined_tail = concat_tuple(tuples...);
+    return concat_tuple(tx, joined_tail);
+}
+
 namespace detail {

 template <typename F, typename X, index_t... Is>
@@ -78,4 +101,69 @@ __host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y,
        f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{});
 }

+// With the defaults Depth = 0 and MaxDepth = -1 nested tuples are flattened completely; an empty tuple is returned as is.
+template <index_t Depth = 0, index_t MaxDepth = -1>
+__host__ __device__ constexpr auto UnrollNestedTuple(const Tuple<>& element)
+{
+    return element;
+}
+
+template <index_t Depth = 0, index_t MaxDepth = -1, typename T>
+__host__ __device__ constexpr auto UnrollNestedTuple(const T& element)
+{
+    return make_tuple(element); // wrap the element in a one-element tuple so callers can concatenate it
+}
+
+template <index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
+__host__ __device__ constexpr auto UnrollNestedTuple(const Tuple<Ts...>& tuple)
+{
+    if constexpr(Depth != MaxDepth)
+    {
+        // Unroll every element one level deeper, then splice the per-element results together.
+        const auto unroll_elements = [&](auto&&... elems) {
+            return concat_tuple(UnrollNestedTuple<Depth + 1, MaxDepth>(elems)...);
+        };
+
+        return unpack(unroll_elements, tuple);
+    }
+    else
+    {
+        // Depth limit reached: hand the tuple back with its nesting untouched.
+        return tuple;
+    }
+}
+
+template <typename... Ts>
+__host__ __device__ constexpr auto TupleReverse(const Tuple<Ts...>& tuple)
+{
+    constexpr index_t num_elems = Tuple<Ts...>::Size();
+
+    // Output position i receives the input element counted from the back.
+    const auto pick_mirrored = [&](auto i) { return tuple.At(Number<num_elems - i - 1>{}); };
+
+    return generate_tuple(pick_mirrored, Number<num_elems>{});
+}
+
+// Combine the tuple elements at positions [Idx, End) with f, nesting to the right:
+// f(t[Idx], f(t[Idx + 1], ... t[End - 1])). A one-element range yields that element itself.
+template <index_t Idx, index_t End, typename F, typename... Ts>
+__host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
+{
+    static_assert(Idx < End, "Wrong parameters for TupleReduce");
+    if constexpr(Idx + 1 != End)
+    {
+        // More than one element left: reduce the rest first, then fold in the current one.
+        return f(tuple.At(Number<Idx>{}), TupleReduce<Idx + 1, End>(f, tuple));
+    }
+    else
+    {
+        // Last element of the range ends the recursion.
+        return tuple.At(Number<Idx>{});
+    }
+}
+
+template <typename T>
+using is_tuple = decltype(std::declval<T&>().IsTuple()); // well-formed only if T has a callable IsTuple() member
+
+template <typename... Ts>
+__host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
+{
+    // True when at least one direct element type exposes IsTuple(); false for an empty tuple.
+    constexpr bool has_tuple_element = (... || is_detected<is_tuple, Ts>::value);
+    return has_tuple_element;
+}
+
 } // namespace ck
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -58,7 +58,12 @@ endfunction(add_instance_library INSTANCE_NAME)


 file(GLOB dir_list LIST_DIRECTORIES true *)
-set(CK_DEVICE_INSTANCES)
+set(CK_DEVICE_OTHER_INSTANCES) # one object list per operation category; each starts out without a value
+set(CK_DEVICE_GEMM_INSTANCES)
+set(CK_DEVICE_CONV_INSTANCES)
+set(CK_DEVICE_MHA_INSTANCES)
+set(CK_DEVICE_CONTRACTION_INSTANCES)
+set(CK_DEVICE_REDUCTION_INSTANCES)
 FOREACH(subdir_path ${dir_list})
    set(target_dir)
    IF(IS_DIRECTORY "${subdir_path}")
@@ -122,7 +127,19 @@ FOREACH(subdir_path ${dir_list})
        if((add_inst EQUAL 1))
            get_filename_component(target_dir ${subdir_path} NAME)
            add_subdirectory(${target_dir})
-            list(APPEND CK_DEVICE_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            if("${cmake_instance}" MATCHES "gemm") # MATCHES is a regex search, so the first keyword found anywhere in the value decides the category
+                list(APPEND CK_DEVICE_GEMM_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "conv")
+                 list(APPEND CK_DEVICE_CONV_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "mha")
+                 list(APPEND CK_DEVICE_MHA_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "contr")
+                 list(APPEND CK_DEVICE_CONTRACTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            elseif("${cmake_instance}" MATCHES "reduce")
+                 list(APPEND CK_DEVICE_REDUCTION_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            else() # anything without a recognised keyword goes to the catch-all list
+                 list(APPEND CK_DEVICE_OTHER_INSTANCES $<TARGET_OBJECTS:device_${target_dir}_instance>)
+            endif()
            message("add_instance_directory ${subdir_path}")
        else()
            message("skip_instance_directory ${subdir_path}")
@@ -130,50 +147,138 @@ FOREACH(subdir_path ${dir_list})
    ENDIF()
 ENDFOREACH()

-add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
-add_library(composablekernels::device_operations ALIAS device_operations)


+if(CK_DEVICE_OTHER_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES})
+        add_library(composablekernels::device_other_operations ALIAS device_other_operations)
+        target_compile_features(device_other_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_other_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/quantization>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/softmax>
+        )
+        rocm_install(TARGETS device_other_operations
+            EXPORT device_other_operationsTargets)
+        rocm_install(EXPORT device_other_operationsTargets
+            FILE composable_kerneldevice_other_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+if(CK_DEVICE_GEMM_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_gemm_operations STATIC ${CK_DEVICE_GEMM_INSTANCES})
+        add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations)
+        target_compile_features(device_gemm_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_gemm_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+        )
+        rocm_install(TARGETS device_gemm_operations
+            EXPORT device_gemm_operationsTargets)
+        rocm_install(EXPORT device_gemm_operationsTargets
+            FILE composable_kerneldevice_gemm_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+if(CK_DEVICE_CONV_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_conv_operations STATIC ${CK_DEVICE_CONV_INSTANCES})
+        add_library(composablekernels::device_conv_operations ALIAS device_conv_operations)
+        target_compile_features(device_conv_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_conv_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_data>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd>
+        )
+        rocm_install(TARGETS device_conv_operations
+            EXPORT device_conv_operationsTargets)
+        rocm_install(EXPORT device_conv_operationsTargets
+            FILE composable_kerneldevice_conv_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+if(CK_DEVICE_MHA_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
+        add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
+        target_compile_features(device_mha_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_mha_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/mha>
+        )
+        rocm_install(TARGETS device_mha_operations
+            EXPORT device_mha_operationsTargets)
+        rocm_install(EXPORT device_mha_operationsTargets
+            FILE composable_kerneldevice_mha_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+if(CK_DEVICE_CONTRACTION_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES})
+        add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations)
+        target_compile_features(device_contraction_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_contraction_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction>
+        )
+        rocm_install(TARGETS device_contraction_operations
+            EXPORT device_contraction_operationsTargets)
+        rocm_install(EXPORT device_contraction_operationsTargets
+            FILE composable_kerneldevice_contraction_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+if(CK_DEVICE_REDUCTION_INSTANCES) # the library is only defined when objects were collected for this category
+        add_library(device_reduction_operations STATIC ${CK_DEVICE_REDUCTION_INSTANCES})
+        add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations)
+        target_compile_features(device_reduction_operations PUBLIC) # no feature names are listed in this call
+        set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_include_directories(device_reduction_operations PUBLIC
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
+        )
+        rocm_install(TARGETS device_reduction_operations
+            EXPORT device_reduction_operationsTargets)
+        rocm_install(EXPORT device_reduction_operationsTargets
+            FILE composable_kerneldevice_reduction_operationsTargets.cmake
+            NAMESPACE composable_kernel::
+            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        )
+endif()
+
+# Umbrella target: linking device_operations pulls in every per-category library
+# that was actually created in this directory, plus utility.
+add_library(device_operations INTERFACE)
+foreach(ck_operation_lib
+        device_contraction_operations
+        device_conv_operations
+        device_gemm_operations
+        device_other_operations
+        device_reduction_operations)
+    # Each category library is only defined when its instance list is non-empty.
+    # A name that is not a target would be passed on as a plain library name and
+    # break the consumer's link step, so only existing targets are linked.
+    if(TARGET ${ck_operation_lib})
+        target_link_libraries(device_operations INTERFACE ${ck_operation_lib})
+    endif()
+endforeach()
+# NOTE(review): device_mha_operations is not part of this list - confirm that is intended.
+target_link_libraries(device_operations INTERFACE utility)
+
 set(DEV_OPS_INC_DIRS
    ${PROJECT_SOURCE_DIR}/include/ck/
    ${PROJECT_SOURCE_DIR}/library/include/ck/
 )
-
-target_compile_features(device_operations PUBLIC)
-set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_include_directories(device_operations PUBLIC
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_description>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/problem_transform>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/device/impl>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/grid>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/block>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/warp>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/thread>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/tensor_operation/gpu/element>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/utility>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
-)
-
-#once new arches are enabled make this an option on the main cmake file
-# and pass down here to be exported
-target_compile_options(device_operations PRIVATE
-    --offload-arch=gfx908
-    --offload-arch=gfx90a
-)
-
-# install(TARGETS device_operations LIBRARY DESTINATION lib)
-rocm_install(TARGETS device_operations
-    EXPORT device_operationsTargets)
-
 rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
-rocm_install(EXPORT device_operationsTargets
-    FILE composable_kerneldevice_operationsTargets.cmake
-    NAMESPACE composable_kernel::
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
-)
+
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -166,7 +166,7 @@ int profile_gemm_impl(int do_verification,
            std::string op_name = op_ptr->GetTypeString();

            float avg_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, 10, 50});

            std::size_t flop = std::size_t(2) * M * N * K;


--- a/profiler/src/profile_transpose.cpp
+++ b/profiler/src/profile_transpose.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdio>
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "profiler/profile_transpose_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+enum struct MatrixLayout // only referenced from commented-out code in this file
+{
+    NCDHW, // 0
+    NCHWD, // 1
+};
+
+enum struct DataType // chosen by the integer passed as arg2 on the command line
+{
+    F32_F32_F32_F32_F32, // 0: fp32
+    F16_F16_F16_F16_F16, // 1: fp16
+};
+
+#define OP_NAME "transpose"
+#define OP_DESC "Transpose"
+
+// Command-line entry point for profiling the transpose operation.
+//
+// Expected arguments:
+//   argv[1]      operation name
+//   argv[2]      data type (0: fp32, 1: fp16)
+//   argv[3]      verification flag
+//   argv[4]      initialization method
+//   argv[5]      tensor logging flag
+//   argv[6]      kernel timing flag
+//   argv[7..11]  tensor lengths N, C, D, H, W
+//
+// Returns 0 when profile_transpose_impl reports success, 1 when it reports failure or
+// the data type is not supported. Prints the usage text and exits with status 1 when
+// the argument count is wrong. std::stoi throws on a non-numeric argument.
+int profile_transpose(int argc, char* argv[])
+{
+    // argv[0] + operation name + five option values + five tensor lengths.
+    constexpr int first_length_arg = 7;
+    constexpr int expected_argc    = 12;
+
+    if(argc != expected_argc)
+    {
+        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: verification (0: no; 1: yes)\n");
+        printf("arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg5: print tensor value (0: no; 1: yes)\n");
+        printf("arg6: time kernel (0=no, 1=yes)\n");
+        printf("arg7 to 11: N, C, D, H, W\n");
+        exit(1);
+    }
+
+    // No layout argument is parsed; MatrixLayout is currently unused.
+    const auto data_type       = static_cast<DataType>(std::stoi(argv[2]));
+    const bool do_verification = std::stoi(argv[3]);
+    const int init_method      = std::stoi(argv[4]);
+    const bool do_log          = std::stoi(argv[5]);
+    const bool time_kernel     = std::stoi(argv[6]);
+
+    // Collect the five tensor lengths (N, C, D, H, W) in command-line order.
+    std::vector<ck::index_t> lengths;
+    for(int arg = first_length_arg; arg < expected_argc; ++arg)
+    {
+        lengths.push_back(static_cast<ck::index_t>(std::stoi(argv[arg])));
+    }
+
+    using F32 = float;
+    using F16 = ck::half_t;
+
+    // The tag values only carry the element types; their contents are never read.
+    auto profile = [&](auto a_type, auto b_type) {
+        using ADataType = decltype(a_type);
+        using BDataType = decltype(b_type);
+
+        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType>(
+            do_verification, init_method, do_log, time_kernel, lengths);
+
+        return pass ? 0 : 1;
+    };
+
+    if(data_type == DataType::F32_F32_F32_F32_F32)
+    {
+        return profile(F32{}, F32{});
+    }
+    else if(data_type == DataType::F16_F16_F16_F16_F16)
+    {
+        return profile(F16{}, F16{});
+    }
+    else
+    {
+        std::cout << "this data_type & layout is not implemented" << std::endl;
+
+        return 1;
+    }
+}
+
+// Pass the entry point defined in this file (profile_transpose) to the registration macro.
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_transpose);
--- a/test/transpose/test_transpose_ut_cases.inc
+++ b/test/transpose/test_transpose_ut_cases.inc
@@ -14,7 +14,6 @@ TYPED_TEST(TestTranspose, Test1)
    this->Run();
 }

-
 TYPED_TEST(TestTranpose, Test2)
 {
    std::vector<int> Ms{127, 255, 312, 799, 1573};
@@ -27,4 +26,3 @@ TYPED_TEST(TestTranpose, Test2)

    this->Run();
 }
-