Commit 1dbdab56 authored by Jing Zhang

merge develop

parents d2e49b23 bac7df8f
@@ -36,7 +36,6 @@ namespace profiler {
 enum struct NormType
 {
-    LAYERNORM,
     BATCHNORM,
     SOFTMAX,
 };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "profiler/include/profile_layernorm_impl.hpp"
using ck::index_t;
struct LayernormArgParser
{
std::unordered_map<std::string, std::vector<int>> long_opts = {
{"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}};
bool parse_opt(int argc, char* argv[], const std::string& key, int i)
{
if(std::string("--") + key == argv[i])
{
int pos = i;
while(++i < argc && argv[i][0] != '-') {}
int end = i;
for(int j = pos + 1; j < end; j++)
{
long_opts[key].push_back(std::stoi(argv[j]));
}
return true;
}
return false;
}
void operator()(int argc, char* argv[])
{
for(auto& kv : long_opts)
{
for(int i = 1; i < argc; i++)
{
if(parse_opt(argc, argv, kv.first, i))
break;
}
}
}
};
void print_help_layernorm()
{
std::cout << "arg1: data type (0: fp16; 1: fp32)\n"
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=n0, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n"
<< "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
<< "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
<< "--strideBeta: tensor strides (e.g, --strideBeta 1)\n"
<< std::endl;
}
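// Example invocation (binary name assumed; argv[1] selects the op in the top-level
// dispatcher, so arg1..arg5 above map to argv[2]..argv[6]):
//   ckProfiler layernorm 1 1 2 0 1 --length 1024 1024 --strideXY 1024 1
//              --strideGamma 1 --strideBeta 1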
int profile_layernorm(int argc, char* argv[])
{
// need argv[1] (op name) plus the five positional options read below
if(argc < 7)
{
print_help_layernorm();
return 0;
}
LayernormArgParser arg_parser;
// positional options (argv[1] holds the op name, so argN maps to argv[N+1])
const ck::DataTypeEnum data_type = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
// parse the long options
arg_parser(argc, argv);
const std::vector<index_t> length = arg_parser.long_opts["length"];
const std::vector<index_t> strideXY = arg_parser.long_opts["strideXY"];
const std::vector<index_t> strideGamma = arg_parser.long_opts["strideGamma"];
const std::vector<index_t> strideBeta = arg_parser.long_opts["strideBeta"];
using F16 = ck::half_t;
using F32 = float;
constexpr int rank = 2;
if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(do_verification,
init_method,
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
}
else if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(do_verification,
init_method,
do_log,
time_kernel,
length,
strideXY,
strideGamma,
strideBeta);
}
else
{
throw std::runtime_error("profile_layernorm: data type not implemented yet");
}
return 0;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_layernorm(argc, argv);
// return 0;
// }
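The commented-out main() above can be fleshed out for standalone debugging; a minimal
sketch (the "ckProfiler" name and argument values are illustrative, not part of the commit):

#include <vector>
int profile_layernorm(int, char*[]);
int main()
{
    // fp32, verify on, decimal init, no tensor dump, time the kernel,
    // then a 1024x1024 row-major problem
    std::vector<const char*> args = {"ckProfiler", "layernorm", "1", "1", "2", "0", "1",
                                     "--length", "1024", "1024",
                                     "--strideXY", "1024", "1",
                                     "--strideGamma", "1",
                                     "--strideBeta", "1"};
    return profile_layernorm(static_cast<int>(args.size()),
                             const_cast<char**>(args.data()));
}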
@@ -13,8 +13,7 @@ using ck::profiler::NormType;
 struct ArgParser
 {
-    std::unordered_map<std::string, NormType> norm_dict = {{"layernorm", NormType::LAYERNORM},
-                                                           {"batchnorm", NormType::BATCHNORM},
+    std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
                                                            {"softmax", NormType::SOFTMAX}};
     std::unordered_map<std::string, std::vector<int>> long_opts = {
......
@@ -19,6 +19,7 @@ int profile_conv_bwd_data(int, char*[]);
 int profile_conv_bwd_weight(int, char*[]);
 int profile_grouped_conv_fwd(int, char*[]);
 int profile_normalization(int, char*[]);
+int profile_layernorm(int, char*[]);
 int profile_reduce(int, char*[]);
 static void print_helper_message()
@@ -115,11 +116,14 @@ int main(int argc, char* argv[])
     {
         return profile_reduce(argc, argv);
     }
-    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 ||
-            strcmp(argv[1], "softmax") == 0)
+    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0)
     {
         return profile_normalization(argc, argv);
     }
+    else if(strcmp(argv[1], "layernorm") == 0)
+    {
+        return profile_layernorm(argc, argv);
+    }
     else
     {
         print_helper_message();
......
@@ -40,6 +40,8 @@ add_subdirectory(gemm_split_k)
 add_subdirectory(gemm_reduce)
 add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
+add_subdirectory(batched_gemm_gemm)
+add_subdirectory(batched_gemm_softmax_gemm)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)
......
add_custom_target(test_batched_gemm_gemm)
add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance)
add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_batched_gemm_gemm_util.hpp"
template <typename Tuple>
class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>
>;
// clang-format on
TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes);
TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16) { this->Run(); }
TYPED_TEST(TestBatchedGemmGemmFP16, DISABLED_Bench_FP16)
{
this->lengths_ = std::vector<std::vector<int>>{
{256, 256, 64, 64, 768},
{256, 256, 128, 128, 768},
{512, 512, 64, 64, 768},
{512, 512, 128, 128, 768},
{1024, 1024, 64, 64, 768},
{1024, 1024, 128, 128, 768},
{2048, 2048, 64, 64, 768},
{2048, 2048, 128, 128, 768},
{4096, 4096, 64, 64, 768},
{4096, 4096, 128, 128, 768},
};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include "profiler/include/profile_batched_gemm_gemm_impl.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <typename Tuple>
struct TestBatchedGemmGemm : public ::testing::Test
{
using ADataType = std::tuple_element_t<0, Tuple>;
using B0DataType = std::tuple_element_t<1, Tuple>;
using B1DataType = std::tuple_element_t<2, Tuple>;
using CDataType = std::tuple_element_t<3, Tuple>;
using ALayout = std::tuple_element_t<4, Tuple>;
using B0Layout = std::tuple_element_t<5, Tuple>;
using B1Layout = std::tuple_element_t<6, Tuple>;
using CLayout = std::tuple_element_t<7, Tuple>;
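// each row of lengths_ is {M, N, K, O, BatchCount} for C = (A * B0) * B1,
// unpacked in Run() below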
std::vector<std::vector<int>> lengths_ = {
{256, 256, 64, 64, 4},
{256, 256, 128, 128, 4},
{512, 512, 64, 64, 2},
{512, 512, 128, 128, 2},
{1024, 1024, 64, 64, 1},
{1024, 1024, 128, 128, 1},
};
bool bench_ = false;
bool verify_ = true;
void RunSingle(int M, int N, int K, int O, int BatchCount)
{
bool pass = ck::profiler::profile_batched_gemm_gemm_impl<ADataType,
B0DataType,
B1DataType,
CDataType,
ALayout,
B0Layout,
B1Layout,
CLayout>(
verify_, 1, false, bench_, M, N, K, O, BatchCount);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int BatchCount = lengths[4];
this->RunSingle(M, N, K, O, BatchCount);
}
}
};
add_custom_target(test_batched_gemm_softmax_gemm)
add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance)
add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16)
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "test_batched_gemm_softmax_gemm_util.hpp"
template <typename Tuple>
class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm<Tuple>
{
};
// clang-format off
using KernelTypes = ::testing::Types<
std::tuple<F16, F16, F16, F16, Row, Col, Row, Row>
>;
// clang-format on
TYPED_TEST_SUITE(TestBatchedGemmSoftmaxGemmFP16, KernelTypes);
TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16) { this->Run(); }
TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16)
{
this->lengths_ = std::vector<std::vector<int>>{
{256, 256, 64, 64, 768},
{256, 256, 128, 128, 768},
{512, 512, 64, 64, 768},
{512, 512, 128, 128, 768},
{1024, 1024, 64, 64, 768},
{1024, 1024, 128, 128, 768},
{2048, 2048, 64, 64, 768},
{2048, 2048, 128, 128, 768},
{4096, 4096, 64, 64, 768},
{4096, 4096, 128, 128, 768},
};
this->bench_ = true;
this->verify_ = false;
this->Run();
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include "profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp"
template <ck::index_t N>
using I = ck::Number<N>;
using F16 = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <typename Tuple>
struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
{
using ADataType = std::tuple_element_t<0, Tuple>;
using B0DataType = std::tuple_element_t<1, Tuple>;
using B1DataType = std::tuple_element_t<2, Tuple>;
using CDataType = std::tuple_element_t<3, Tuple>;
using ALayout = std::tuple_element_t<4, Tuple>;
using B0Layout = std::tuple_element_t<5, Tuple>;
using B1Layout = std::tuple_element_t<6, Tuple>;
using CLayout = std::tuple_element_t<7, Tuple>;
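// each row of lengths_ is {M, N, K, O, BatchCount}; softmax is applied to the
// first gemm's M x N output before it is multiplied by B1 (N x O)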
std::vector<std::vector<int>> lengths_ = {
{256, 256, 64, 64, 4},
{256, 256, 128, 128, 4},
{512, 512, 64, 64, 2},
{512, 512, 128, 128, 2},
{1024, 1024, 64, 64, 1},
{1024, 1024, 128, 128, 1},
};
bool bench_ = false;
bool verify_ = true;
void RunSingle(int M, int N, int K, int O, int BatchCount)
{
bool pass = ck::profiler::profile_batched_gemm_softmax_gemm_impl<ADataType,
B0DataType,
B1DataType,
CDataType,
ALayout,
B0Layout,
B1Layout,
CLayout>(
verify_, 1, false, bench_, M, N, K, O, BatchCount);
EXPECT_TRUE(pass);
}
void Run()
{
for(auto lengths : this->lengths_)
{
int M = lengths[0];
int N = lengths[1];
int K = lengths[2];
int O = lengths[3];
int BatchCount = lengths[4];
this->RunSingle(M, N, K, O, BatchCount);
}
}
};
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/number.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -63,24 +63,24 @@ class TestLayernorm : public ::testing::Test
                        Rank,
                        NumReduceDim>;
-    using DeviceInstance = tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                     GammaDataType,
-                                                                     BetaDataType,
-                                                                     AccDataType,
-                                                                     YDataType,
-                                                                     PassThrough,
-                                                                     Rank,
-                                                                     NumReduceDim,
-                                                                     BlockSize,
-                                                                     MThreadClusterSize,
-                                                                     KThreadClusterSize,
-                                                                     MThreadSliceSize,
-                                                                     KThreadSliceSize,
-                                                                     XYSrcVectorDim,
-                                                                     XSrcVectorSize,
-                                                                     GammaSrcVectorSize,
-                                                                     BetaSrcVectorSize,
-                                                                     YDstVectorSize>;
+    using DeviceInstance = tensor_operation::device::DeviceLayernormImpl<XDataType,
+                                                                         GammaDataType,
+                                                                         BetaDataType,
+                                                                         AccDataType,
+                                                                         YDataType,
+                                                                         PassThrough,
+                                                                         Rank,
+                                                                         NumReduceDim,
+                                                                         BlockSize,
+                                                                         MThreadClusterSize,
+                                                                         KThreadClusterSize,
+                                                                         MThreadSliceSize,
+                                                                         KThreadSliceSize,
+                                                                         XYSrcVectorDim,
+                                                                         XSrcVectorSize,
+                                                                         GammaSrcVectorSize,
+                                                                         BetaSrcVectorSize,
+                                                                         YDstVectorSize>;
 TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
@@ -119,6 +119,7 @@ class TestLayernorm : public ::testing::Test
                                  gamma.mDesc.GetStrides().end()},
         std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(),
                                  beta.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
         reduceDims,
         1e-4,
         x_dev.GetDeviceBuffer(),
......