Merge remote-tracking branch 'origin/develop' into contraction

6ef4e211 · Chao Liu · b0a2afb9 · 9e4429f9 · 6ef4e211 · 6ef4e211
Commit 6ef4e211 authored Jul 05, 2022 by Chao Liu
7 changed files
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -11,11 +11,12 @@
 #include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
+
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/host_tensor/device_memory.hpp"
 #include "ck/library/host_tensor/host_tensor.hpp"
 #include "ck/library/host_tensor/host_tensor_generator.hpp"
-#include "ck/library/host_tensor/device_memory.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 #include "ck/library/host_tensor/host_gemm.hpp"
@@ -28,26 +29,6 @@ enum struct GemmMatrixLayout
    KM_NK_MN, // 3
 };

-using DeviceGemmNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::PassThrough>;
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_gemm_instance {
-
-void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-
-} // namespace device_gemm_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
 template <typename T>
 static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
@@ -79,6 +60,11 @@ struct gemmArgs

 int test_gemm(const gemmArgs& args)
 {
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
    bool a_row_major, b_row_major, c_row_major;

    switch(args.layout)
@@ -149,64 +135,79 @@ int test_gemm(const gemmArgs& args)
    b_device_buf.ToDevice(b_k_n.mData.data());
    c_device_buf.ToDevice(c_m_n_device_result.mData.data());

-    // add device GEMM instances
-    std::vector<DeviceGemmNoOpPtr> gemm_ptrs;
+    auto test = [&](auto a_layout, auto b_layout, auto c_layout) {
+        bool success = false;
+
+        using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<decltype(a_layout),
+                                                                        decltype(b_layout),
+                                                                        decltype(c_layout),
+                                                                        float,
+                                                                        float,
+                                                                        float,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        PassThrough>;
+
+        const auto gemm_ptrs =
+            ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+                DeviceOp>::GetInstances();
+
+        for(auto& gemm_ptr : gemm_ptrs)
+        {
+            auto argument_ptr =
+                gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
+                                              static_cast<float*>(b_device_buf.GetDeviceBuffer()),
+                                              static_cast<float*>(c_device_buf.GetDeviceBuffer()),
+                                              args.M,
+                                              args.N,
+                                              args.K,
+                                              args.StrideA,
+                                              args.StrideB,
+                                              args.StrideC,
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              args.KBatch);
+
+            auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
+
+            if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                invoker_ptr->Run(argument_ptr.get());
+
+                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
+
+                if(!check_out(c_m_n_host_result, c_m_n_device_result))
+                {
+                    success = false;
+                    break;
+                }
+                success = true;
+            }
+        }
+
+        return success;
+    };
+
+    bool success = false;

    if(args.layout == GemmMatrixLayout::MK_KN_MN)
    {
-        ck::tensor_operation::device::device_gemm_instance::
-            add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemm_ptrs);
+        success = test(Row{}, Row{}, Row{});
    }
    else if(args.layout == GemmMatrixLayout::MK_NK_MN)
    {
-        ck::tensor_operation::device::device_gemm_instance::
-            add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemm_ptrs);
+        success = test(Row{}, Col{}, Row{});
    }
    else if(args.layout == GemmMatrixLayout::KM_KN_MN)
    {
-        ck::tensor_operation::device::device_gemm_instance::
-            add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs);
+        success = test(Col{}, Row{}, Row{});
    }
    else
    {
-        ck::tensor_operation::device::device_gemm_instance::
-            add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(gemm_ptrs);
+        success = test(Col{}, Col{}, Row{});
    }

-    bool success = false;
-    for(auto& gemm_ptr : gemm_ptrs)
-    {
-        auto argument_ptr =
-            gemm_ptr->MakeArgumentPointer(static_cast<float*>(a_device_buf.GetDeviceBuffer()),
-                                          static_cast<float*>(b_device_buf.GetDeviceBuffer()),
-                                          static_cast<float*>(c_device_buf.GetDeviceBuffer()),
-                                          args.M,
-                                          args.N,
-                                          args.K,
-                                          args.StrideA,
-                                          args.StrideB,
-                                          args.StrideC,
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          ck::tensor_operation::element_wise::PassThrough{},
-                                          args.KBatch);
-
-        auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
-
-        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-            invoker_ptr->Run(argument_ptr.get());
-
-            c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-            if(!check_out(c_m_n_host_result, c_m_n_device_result))
-            {
-                success = false;
-                break;
-            }
-            success = true;
-        }
-    }
    auto error_code = 0;
    if(success)
    {

--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -28,7 +28,7 @@ using DeviceGroupedGemmPtr_ = ck::tensor_operation::device::DeviceGroupedGemmPtr
 namespace ck {
 namespace tensor_operation {
 namespace device {
-namespace device_grouped_gemm_instance {
+namespace instance {
 void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
    std::vector<DeviceGroupedGemmPtr_>&);
 }
@@ -197,7 +197,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
 int main()
 {
    std::vector<DeviceGroupedGemmPtr_> groupedGemmPtrs;
-    ck::tensor_operation::device::device_grouped_gemm_instance::
+    ck::tensor_operation::device::instance::
        add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(groupedGemmPtrs);

    bool res = true;

--- a/test/softmax/CMakeLists.txt
+++ b/test/softmax/CMakeLists.txt
@@ -2,7 +2,10 @@ add_custom_target(test_softmax)

 add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
 add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
+add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
 target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
 target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
+target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
 add_dependencies(test_softmax test_softmax_fp32)
-add_dependencies(test_softmax test_softmax_fp16)
\ No newline at end of file
+add_dependencies(test_softmax test_softmax_fp16)
+add_dependencies(test_softmax test_softmax_int8)
\ No newline at end of file
--- a/test/softmax/test_softmax_fp16.cpp
+++ b/test/softmax/test_softmax_fp16.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
-    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);

--- a/test/softmax/test_softmax_fp32.cpp
+++ b/test/softmax/test_softmax_fp32.cpp
@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
 // clang-format off
 using KernelTypes = ::testing::Types<
 // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
-    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
    >;
 // clang-format on
 TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);

--- a/test/softmax/test_softmax_int8.cpp
+++ b/test/softmax/test_softmax_int8.cpp
+#include "gtest/gtest.h"
+#include "test_softmax_util.hpp"
+
+template <ck::index_t N>
+using I = ck::Number<N>;
+
+template <typename Tuple>
+class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
+{
+};
+
+// clang-format off
+using KernelTypes = ::testing::Types<
+// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
+    std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
+TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
--- a/test/softmax/test_softmax_util.hpp
+++ b/test/softmax/test_softmax_util.hpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

+#pragma once
+
 #include <vector>
 #include <iostream>
 #include <gtest/gtest.h>
@@ -16,6 +18,18 @@

 namespace ck {

+template <typename Range>
+std::string serialize_range(const Range& range)
+{
+    std::stringstream ss;
+    for(auto& r : range)
+    {
+        ss << r << ", ";
+    }
+    std::string str = ss.str();
+    return std::string(str.begin(), str.end() - 2);
+}
+
 template <typename Tuple>
 class TestSoftmax : public ::testing::Test
 {
@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
        auto argument_ptr    = device_instance.MakeArgumentPointer(i_in_lengths,
                                                                i_in_strides,
                                                                reduce_dims,
-                                                                alpha,
-                                                                beta,
+                                                                &alpha,
+                                                                &beta,
                                                                in_dev.GetDeviceBuffer(),
                                                                out_dev.GetDeviceBuffer());

        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
        {
-            FAIL() << "Unsupported argument";
+            // std::cout << "Skipped due to unsupported argument: "
+            //           << "input lengths = [" << serialize_range(in_length) << "], "
+            //           << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+            return;
        }

        auto invoker_ptr = device_instance.MakeInvokerPointer();
        invoker_ptr->Run(argument_ptr.get());

-        ref_instance_invoker_.Run({in, out_ref, alpha, beta, Rank, reduce_dims});
+        ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});

        out_dev.FromDevice(out.mData.data());
-        EXPECT_TRUE(ck::utils::check_err(out.mData, out_ref.mData));
+
+        bool pass;
+
+        if(std::is_same<InDataType, int8_t>::value)
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(
+                            out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
+        }
+        else
+        {
+            EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
+        }
+
+        if(!pass)
+        {
+            FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
+                   << "scaler = [" << alpha << ", " << beta << "].";
+        }
    }

    void Run()
@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
        {
            for(auto scale : this->scales_)
            {
-                this->RunSingle(in_length, std::get<0>(scale), std::get<1>(scale));
+                this->RunSingle(in_length, scale[0], scale[1]);
            }
        }
    }

-    std::vector<std::vector<index_t>> in_lengths_ = {{1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}};
-    std::vector<std::tuple<AccDataType, AccDataType>> scales_ = {{1, 0}, {2, 2}, {0, 1}};
+    std::vector<std::vector<index_t>> in_lengths_ = {
+        {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
+    std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};

    typename ReferenceInstance::Invoker ref_instance_invoker_;
 };