Merge branch 'develop' into dl_conv_multiple_d

ca313a29 · letaoqin · d47bf127 · 8784a72e · ca313a29 · ca313a29
Commit ca313a29 authored Dec 02, 2022 by letaoqin
20 changed files
--- a/profiler/src/profiler_operation_registry.hpp
+++ b/profiler/src/profiler_operation_registry.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <string_view>
+#include <utility>
+class ProfilerOperationRegistry final // Name -> (description, callable) registry reached through GetInstance().
+{
+    ProfilerOperationRegistry()  = default; // private (class default access): not constructible outside the class
+    ~ProfilerOperationRegistry() = default;
+    public:
+    using Operation = std::function<int(int, char*[])>; // presumably an argc/argv-style entry point -- confirm with registrants
+    private:
+    struct Entry final // One registered operation: its description text plus the callable.
+    {
+        explicit Entry(std::string_view description, Operation operation) noexcept
+            : description_(description), operation_(std::move(operation))
+        {
+        }
+        std::string_view description_; // non-owning view: the caller's characters must stay alive while the entry is used
+        Operation operation_;
+    };
+    std::map<std::string_view, Entry> entries_; // keys are non-owning views too; std::map keeps them in sorted order
+    friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry) // dumps all entries
+    {
+        stream << "{\n";
+        for(auto& [name, entry] : registry.entries_)
+        {
+            stream << "\t" << name << ": " << entry.description_ << "\n"; // one "<tab>name: description" line per entry
+        }
+        stream << "}"; // no trailing newline is written after the closing brace
+        return stream;
+    }
+    public:
+    static ProfilerOperationRegistry& GetInstance() // Returns the single function-local static instance.
+    {
+        static ProfilerOperationRegistry registry; // constructed the first time control passes through this line
+        return registry;
+    }
+    std::optional<Operation> Get(std::string_view name) const // Looks up name; yields a copy of its callable if present.
+    {
+        const auto found = entries_.find(name);
+        if(found == end(entries_))
+        {
+            return std::nullopt; // name was never registered
+        }
+        return (found->second).operation_; // copied into the returned optional
+    }
+    bool Add(std::string_view name, std::string_view description, Operation operation) // Registers an operation.
+    {
+        return entries_
+            .emplace(std::piecewise_construct,
+                     std::forward_as_tuple(name),
+                     std::forward_as_tuple(description, std::move(operation)))
+            .second; // true if inserted; false if name already existed (the existing entry is kept)
+    }
+};
+#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y) // extra level so x and y are macro-expanded before ## pastes them
+#define PP_CONCAT_IMPL(x, y) x##y // raw token paste; use PP_CONCAT when an argument is itself a macro
+#define REGISTER_PROFILER_OPERATION(name, description, operation)              \
+    static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
+        ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation) // Add() runs when this static is initialized; __COUNTER__ (compiler extension) keeps each variable name unique within a translation unit
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
 include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/
+    ${PROJECT_SOURCE_DIR}/profiler/include
 )
 include(googletest)
@@ -53,4 +54,4 @@ add_subdirectory(softmax)
 add_subdirectory(normalization)
 add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
-add_subdirectory(batchnorm_fwd)
+add_subdirectory(batchnorm)
--- a/test/batched_gemm/batched_gemm_bf16.cpp
+++ b/test/batched_gemm/batched_gemm_bf16.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 namespace {
 using ADataType = ck::bhalf_t;

--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 namespace {
 using ADataType = ck::half_t;

--- a/test/batched_gemm/batched_gemm_fp32.cpp
+++ b/test/batched_gemm/batched_gemm_fp32.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 namespace {
 using ADataType = float;

--- a/test/batched_gemm/batched_gemm_int8.cpp
+++ b/test/batched_gemm/batched_gemm_int8.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_batched_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_impl.hpp"
 namespace {
 using ADataType = int8_t;

--- a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
+++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_gemm_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;

--- a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
+#include "profiler/profile_batched_gemm_reduce_impl.hpp"
 int main()
 {

--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;
 template <ck::index_t N>

--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
-#include "profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
+#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp"
 using ck::tensor_operation::device::GemmSpecialization;
 using ck::tensor_operation::device::MaskingSpecialization;

--- a/test/batchnorm_fwd/CMakeLists.txt
+++ b/test/batchnorm/CMakeLists.txt
 add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
+add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp)
 target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
+target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <tuple>
+#include <gtest/gtest.h>
+#include "profiler/profile_batchnorm_backward_impl.hpp"
+using F16  = ck::half_t;
+using F32  = float;
+using BF16 = ck::bhalf_t;
+using F64  = double;
+template <typename Tuple>
+class TestBatchNormBwdRank4 : public ::testing::Test // Typed fixture driving profile_batchnorm_backward_impl on 4-D length sets.
+{
+    private:
+    const double epsilon = std::numeric_limits<float>::epsilon(); // float machine epsilon widened to double; last call argument
+    protected:
+    using XDataType       = std::tuple_element_t<0, Tuple>; // Tuple element order fixes which type plays which role
+    using DxDataType      = std::tuple_element_t<1, Tuple>;
+    using DyDataType      = std::tuple_element_t<2, Tuple>;
+    using AccDataType     = std::tuple_element_t<3, Tuple>;
+    using ScaleDataType   = std::tuple_element_t<4, Tuple>;
+    using BiasDataType    = std::tuple_element_t<5, Tuple>;
+    using MeanVarDataType = std::tuple_element_t<6, Tuple>;
+    std::vector<std::vector<size_t>> list_of_lengths = { // every Run() iterates over all of these 4-element length sets
+        {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
+    std::vector<int> reduceDims; // empty by default; must be filled in before Run() is called
+    template <int NumReduceDim>
+    void Run() // For each length set, calls the profiler twice and expects both calls to return true.
+    {
+        for(auto& inOutLengths : list_of_lengths)
+        {
+            bool pass = true;
+            EXPECT_FALSE(reduceDims.size() != NumReduceDim); // i.e. sizes must match; EXPECT_* is non-fatal, so execution continues
+            pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
+                                                                         DxDataType,
+                                                                         DyDataType,
+                                                                         AccDataType,
+                                                                         ScaleDataType,
+                                                                         BiasDataType,
+                                                                         MeanVarDataType,
+                                                                         4,
+                                                                         NumReduceDim>(
+                               true, 3, false, false, inOutLengths, reduceDims, true, epsilon); // NOTE(review): flag meanings are defined by the impl, not visible here
+            pass = pass && ck::profiler::profile_batchnorm_backward_impl<XDataType,
+                                                                         DxDataType,
+                                                                         DyDataType,
+                                                                         AccDataType,
+                                                                         ScaleDataType,
+                                                                         BiasDataType,
+                                                                         MeanVarDataType,
+                                                                         4,
+                                                                         NumReduceDim>(
+                               true, 3, false, false, inOutLengths, reduceDims, false, epsilon); // differs from the first call only in the 7th argument; && skips this call if the first returned false
+            EXPECT_TRUE(pass);
+        }
+    }
+};
+using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>, // per tuple: X, Dx, Dy, Acc, Scale, Bias, MeanVar
+                                     std::tuple<F32, F32, F32, F32, F32, F32, F32>,
+                                     std::tuple<BF16, F32, F32, F32, BF16, F32, F32>,
+                                     std::tuple<F64, F64, F64, F64, F64, F64, F64>>;
+TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes); // each TYPED_TEST below is instantiated once per tuple above
+// nhwc
+TYPED_TEST(TestBatchNormBwdRank4, nhwc)
+{
+    this->reduceDims = {0, 1, 2}; // three entries, matching Run<3>
+    this->template Run<3>(); // this-> / template are required because the fixture is a dependent base class
+}
+// nchw
+TYPED_TEST(TestBatchNormBwdRank4, nchw)
+{
+    this->reduceDims = {0, 2, 3}; // three entries, matching Run<3>
+    this->template Run<3>();
+}
--- a/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
-#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+#include "profiler/profile_batchnorm_forward_impl.hpp"
 using F16  = ck::half_t;
 using F32  = float;

--- a/test/convnd_bwd_data/convnd_bwd_data.cpp
+++ b/test/convnd_bwd_data/convnd_bwd_data.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
-#include "profiler/include/profile_conv_bwd_data_impl.hpp"
+#include "profiler/profile_conv_bwd_data_impl.hpp"
 template <typename Tuple>
 class TestConvndBwdData : public ::testing::Test

--- a/test/convnd_fwd/convnd_fwd.cpp
+++ b/test/convnd_fwd/convnd_fwd.cpp
@@ -8,7 +8,7 @@
 #include <tuple>
 #include <gtest/gtest.h>
-#include "profiler/include/profile_conv_fwd_impl.hpp"
+#include "profiler/profile_conv_fwd_impl.hpp"
 template <typename Tuple>
 class TestConvndFwd : public ::testing::Test

--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "gtest/gtest.h"
-#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
+#include "profiler/profile_elementwise_layernorm_impl.hpp"
 using F16 = ck::half_t;
 using F32 = float;

--- a/test/gemm_reduce/gemm_reduce_fp16.cpp
+++ b/test/gemm_reduce/gemm_reduce_fp16.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
-#include "profiler/include/profile_gemm_reduce_impl.hpp"
+#include "profiler/profile_gemm_reduce_impl.hpp"
 int main()
 {

--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -226,9 +226,8 @@ int main(int argc, char* argv[])
    std::vector<gemmArgs> test_cases;
    if(argc == 1)
    {
-        test_cases = {{GemmMatrixLayout::MK_KN_MN, 3, 3, 3, 3, 3, 3, 1}};
+        test_cases = {{GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 2},
-        // JD: Populate with more and meaningful
+                      {GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 8}};
-        return 0;
    }
    else if(argc == 9)
    {
@@ -253,11 +252,10 @@ int main(int argc, char* argv[])
        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
        return -1;
    }
+    bool error = false;
    for(const auto& kinder : test_cases)
    {
-        const auto res = test_gemm(kinder);
+        error |= test_gemm(kinder);
-        if(!res)
-            return -1;
    }
-    return 0;
+    return error ? 1 : 0;
 }
--- a/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
+++ b/test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp
@@ -9,7 +9,7 @@
 #include <gtest/gtest.h>
-#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
+#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp"
 template <typename Tuple>
 class TestGroupedConvndBwdWeight : public ::testing::Test

--- a/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
+++ b/test/grouped_convnd_fwd/grouped_convnd_fwd.cpp
@@ -7,7 +7,7 @@
 #include <vector>
 #include <gtest/gtest.h>
-#include "profiler/include/profile_grouped_conv_fwd_impl.hpp"
+#include "profiler/profile_grouped_conv_fwd_impl.hpp"
 class TestGroupedConvNdFwd : public ::testing::Test
 {