Merge remote-tracking branch 'origin/develop' into bwroblew/direct_loads

261d3267 · Bartlomiej Wroblewski · 2d5b22fe · f2398f61 · 261d3267 · 2d5b22fe
Commit 261d3267 authored Nov 14, 2023 by Bartlomiej Wroblewski
12 changed files
--- a/test/grouped_gemm/test_grouped_gemm_interface.cpp
+++ b/test/grouped_gemm/test_grouped_gemm_interface.cpp
@@ -108,6 +108,10 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
    // kloops % 2
    Ks = std::vector<int>{256, 512, 320, 768};
+    EXPECT_FALSE(
+        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+    Ks = std::vector<int>{256, 512, 384, 768};
    EXPECT_TRUE(
        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));

--- a/test/normalization/CMakeLists.txt
+++ b/test/normalization/CMakeLists.txt
-add_custom_target(test_normalization)
-add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
-  add_dependencies(test_normalization test_layernorm2d_fp32)
-endif()
-add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
-  add_dependencies(test_normalization test_groupnorm_fp32)
-endif()
-add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
-  add_dependencies(test_normalization test_layernorm2d_fp16)
-endif()
-add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
-if(result EQUAL 0)
-  target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
-  add_dependencies(test_normalization test_groupnorm_fp16)
-endif()
--- a/test/normalization_fwd/CMakeLists.txt
+++ b/test/normalization_fwd/CMakeLists.txt
+# Umbrella target that groups the forward-normalization test executables declared below.
+add_custom_target(test_normalization_fwd)
+# Every test follows the same pattern: declare the gtest executable, then link it and
+# attach it to the umbrella target only when `result` equals 0.
+# NOTE(review): `result` is presumably set by add_gtest_executable() to signal whether
+# the target was actually created -- confirm against that helper's definition.
+# Layernorm 2D forward, fp32.
+add_gtest_executable(test_layernorm2d_fwd_fp32 test_layernorm2d_fwd_fp32.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_layernorm2d_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
+  add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp32)
+endif()
+# Groupnorm forward, fp32.
+add_gtest_executable(test_groupnorm_fwd_fp32 test_groupnorm_fwd_fp32.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_groupnorm_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
+  add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp32)
+endif()
+# Layernorm 2D forward, fp16.
+add_gtest_executable(test_layernorm2d_fwd_fp16 test_layernorm2d_fwd_fp16.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_layernorm2d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
+  add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp16)
+endif()
+# Layernorm 4D forward, fp16.
+add_gtest_executable(test_layernorm4d_fwd_fp16 test_layernorm4d_fwd_fp16.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_layernorm4d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
+  add_dependencies(test_normalization_fwd test_layernorm4d_fwd_fp16)
+endif()
+# Groupnorm forward, fp16.
+add_gtest_executable(test_groupnorm_fwd_fp16 test_groupnorm_fwd_fp16.cpp)
+if(result EQUAL 0)
+  target_link_libraries(test_groupnorm_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
+  add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp16)
+endif()
--- a/test/normalization/test_groupnorm_fp16.cpp
+++ b/test/normalization/test_groupnorm_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"
 using F16 = ck::half_t;
 using F32 = float;

--- a/test/normalization/test_groupnorm_fp32.cpp
+++ b/test/normalization/test_groupnorm_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"
 using F16 = ck::half_t;
 using F32 = float;

--- a/test/normalization/test_layernorm2d_fp16.cpp
+++ b/test/normalization/test_layernorm2d_fp16.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"
 using F16 = ck::half_t;
 using F32 = float;

--- a/test/normalization/test_layernorm2d_fp32.cpp
+++ b/test/normalization/test_layernorm2d_fp32.cpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"
 using F16 = ck::half_t;
 using F32 = float;

--- a/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
+++ b/test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "profiler/profile_layernorm_fwd_impl.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+// Typed-test fixture that runs the layernorm forward profiler on rank-4 inputs.
+// The element types are taken positionally from the std::tuple supplied as Tuple.
+template <typename Tuple>
+class TestLayernorm4d : public ::testing::Test
+{
+    protected:
+    using XDataType              = std::tuple_element_t<0, Tuple>;
+    using GammaDataType          = std::tuple_element_t<1, Tuple>;
+    using BetaDataType           = std::tuple_element_t<2, Tuple>;
+    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
+    using YDataType              = std::tuple_element_t<4, Tuple>;
+    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;
+    // Invokes the profiler once per shape and expects each invocation to report success.
+    void Run()
+    {
+        // Rank-4 problem sizes (four extents per shape), matching the `4` template argument
+        // passed to the profiler below.
+        // NOTE(review): which of the four dimensions are reduced is decided inside
+        // profile_layernorm_impl -- confirm there.
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {1, 1, 1, 1}, {7, 7, 7, 7}, {256, 16, 16, 8}};
+        // The shape is only read in this loop, so bind by const reference instead of
+        // copying the vector on every iteration.
+        for(const auto& length : lengths)
+        {
+            // NOTE(review): the runtime arguments presumably mean
+            // (do_verification, init_method, do_log, time_kernel, length), and the trailing
+            // template arguments `true, 4` presumably enable saving mean/inv-std and select
+            // rank 4 -- confirm against profile_layernorm_impl.
+            bool success = ck::profiler::profile_layernorm_impl<XDataType,
+                                                                GammaDataType,
+                                                                BetaDataType,
+                                                                ComputeDataType,
+                                                                YDataType,
+                                                                SaveMeanInvStdDataType,
+                                                                true,
+                                                                4>(true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+using KernelTypes = ::testing::Types<
+    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, SaveMeanInvStdDataType
+    std::tuple<F16, F16, F16, F32, F16, F32>>;
+TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
+TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
--- a/test/transpose/CMakeLists.txt
+++ b/test/transpose/CMakeLists.txt
+# GPU architectures for which the transpose test is built.
+list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
+# Guard flag: stays 0 until the test target has been created.
+set(target 0)
+# Create the target for the first entry of GPU_TARGETS that appears in gpu_list; the flag
+# keeps later matching entries from declaring the same executable a second time.
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list AND target EQUAL 0)
+   add_gtest_executable(test_transpose test_transpose.cpp)
+   target_link_libraries(test_transpose PRIVATE utility device_transpose_instance)
+   set(target 1)
+ endif()
+endforeach()
--- a/test/transpose/test_transpose.cpp
+++ b/test/transpose/test_transpose.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <tuple>
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_transpose_util.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+// Typed-test fixture registered with TYPED_TEST_SUITE below. It declares no members.
+// NOTE(review): test_transpose_util.hpp defines another TestTranspose fixture inside
+// namespace ck::test, and that one provides Run(). The cases in
+// test_transpose_ut_cases.inc call this->Run(), so they would need this fixture to
+// expose Run() (for example by deriving from ck::test::TestTranspose<Tuple>) before the
+// commented-out #include at the end of this file can be enabled -- confirm the intent.
+template <typename Tuple>
+class TestTranspose : public ::testing::Test
+{
+};
+// clang-format off
+using KernelTypes = ::testing::Types<
+    std::tuple<      F16,       F16>,
+    std::tuple<      F32,       F32>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestTranspose, KernelTypes);
+//#include "test_transpose_ut_cases.inc"
--- a/test/transpose/test_transpose_ut_cases.inc
+++ b/test/transpose/test_transpose_ut_cases.inc
+#pragma once
+// Runs the fixture's transpose check for every element-type combination in the suite.
+TYPED_TEST(TestTranspose, Test1)
+{
+    // Shape this case was written for: N = 16, C = 8, D = 16, H = 32, W = 8.
+    // Run() is called without arguments, so this shape is not forwarded to the fixture;
+    // it is kept here for reference only.
+    this->Run();
+}
+// Runs the fixture's transpose check for every element-type combination in the suite.
+TYPED_TEST(TestTranspose, Test2)
+{
+    // Shape this case was written for: N = 16, C = 8, D = 16, H = 32, W = 16.
+    // Run() is called without arguments, so this shape is not forwarded to the fixture;
+    // it is kept here for reference only.
+    this->Run();
+}
--- a/test/transpose/test_transpose_util.hpp
+++ b/test/transpose/test_transpose_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_transpose_impl.hpp"
+namespace ck {
+namespace test {
+// Typed-test fixture that drives ck::profiler::profile_transpose_impl over a fixed list of
+// 5-element shapes. Tuple supplies the element types: index 0 is ADataType and index 1 is
+// BDataType.
+template <typename Tuple>
+class TestTranspose : public testing::Test
+{
+    using F32 = float;
+    protected:
+    using ADataType = std::tuple_element_t<0, Tuple>;
+    using BDataType = std::tuple_element_t<1, Tuple>;
+    public:
+    // Settings forwarded unchanged to the profiler on every call.
+    static constexpr bool verify_              = true;
+    static constexpr int init_method_          = 1; // decimal value initialization
+    static constexpr bool log_                 = false;
+    static constexpr bool bench_               = false; // measure kernel performance
+    // Shapes exercised by Run(); each entry has five extents, matching the `5` template
+    // argument passed to the profiler.
+    std::vector<std::vector<index_t>> lengths_ = {{16, 32, 16, 32, 16}, {16, 8, 16, 32, 8}};
+    // Profiles every shape in lengths_, one profiler call per shape.
+    void Run()
+    {
+        // Shapes are only read here, so iterate by const reference to avoid copying them.
+        for(const auto& length : this->lengths_)
+        {
+            this->RunSingle(length);
+        }
+    }
+    // Profiles one shape and expects the profiler to report success.
+    // `length` is the single shape to run; it is forwarded as the last profiler argument
+    // (the whole lengths_ list is deliberately not passed here).
+    void RunSingle(const std::vector<index_t>& length)
+    {
+        bool pass = ck::profiler::profile_transpose_impl<ADataType, BDataType, 5>(
+            verify_, init_method_, log_, bench_, length);
+        EXPECT_TRUE(pass);
+    }
+};
+} // namespace test
+} // namespace ck