gaoqiong / MIGraphX

Commit e3950e2c, authored May 06, 2019 by Shucai Xiao

    check in the initial GPU implementation of the int8 gemm.

Parent: ae21ecbf

Showing 6 changed files with 158 additions and 31 deletions:
    src/targets/gpu/CMakeLists.txt                           +1   -0
    src/targets/gpu/device/pack.cpp                          +63  -0
    src/targets/gpu/include/migraphx/gpu/device/pack.hpp     +23  -0
    src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp      +10  -1
    src/targets/gpu/lowering.cpp                             +30  -1
    src/targets/gpu/quant_gemm.cpp                           +31  -29
src/targets/gpu/CMakeLists.txt

@@ -32,6 +32,7 @@ add_library(migraphx_device
     device/pad.cpp
     device/gather.cpp
     device/sub.cpp
+    device/pack.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
src/targets/gpu/device/pack.cpp (new file, mode 100644)

#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/pack.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void pack_a(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_0        = output_shape.lens().size() - 2;
    std::size_t ldb   = output_shape.strides()[dim_0];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                // pack groups of nb = 4 consecutive int8 values along k so
                // they sit contiguously; this routine is applied to operand B
                // in quant_gemm.cpp, hence the ldb naming
                const size_t nb = 4;
                auto idx        = desc.multi(ii);
                std::size_t i_m = idx[0];
                std::size_t i_k = idx[1];
                out_ptr[i_k % nb + (i_m + (i_k / nb) * ldb) * nb] = in_ptr[i_m + i_k * ldb];
            });
        });
    });
}

void pack_b(hipStream_t stream, const argument& result, const argument& arg)
{
    auto output_shape = result.get_shape();
    auto dim_1        = output_shape.lens().size() - 1;
    std::size_t lda   = output_shape.strides()[dim_1];
    visit_all(result, arg)([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        auto* out_ptr         = device_cast(output.data());
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(output_shape.lens().size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(output_shape);
            gs_launch(stream, nelements)([=](auto ii) {
                // same packing transform, applied to operand A in quant_gemm.cpp
                const size_t nb = 4;
                auto idx        = desc.multi(ii);
                std::size_t i_n = idx[0];
                std::size_t i_k = idx[1];
                out_ptr[i_k % nb + (i_n + (i_k / nb) * lda) * nb] = in_ptr[i_n + i_k * lda];
            });
        });
    });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
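Both kernels apply the same index transform in parallel over the output elements. A minimal host-side reference of that layout may make it easier to follow; this is a hypothetical standalone helper, not part of the commit, with `ld` standing in for the lda/ldb strides above:

// Host-side reference for the nb = 4 int8 packing used by the kernels above.
// Assumes ld >= rows and k % 4 == 0, matching the asserts in quant_gemm.cpp.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int8_t>
pack_k4(const std::vector<int8_t>& in, std::size_t rows, std::size_t k, std::size_t ld)
{
    const std::size_t nb = 4;
    std::vector<int8_t> out(in.size());
    for(std::size_t i = 0; i < rows; ++i)     // row index (i_m or i_n in the kernels)
    {
        for(std::size_t ik = 0; ik < k; ++ik) // k index
        {
            // four consecutive k values of the same row end up adjacent in memory
            out[ik % nb + (i + (ik / nb) * ld) * nb] = in[i + ik * ld];
        }
    }
    return out;
}

For example, with ld = 2 the value at in[1 + 6 * 2] (row 1, k = 6) lands at out[6 % 4 + (1 + 1 * 2) * 4] = out[14]: row 1's k-values 4..7 occupy the contiguous group out[12..15].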
src/targets/gpu/include/migraphx/gpu/device/pack.hpp (new file, mode 100644)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_PACK_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void pack_a(hipStream_t stream, const argument& result, const argument& arg);

void pack_b(hipStream_t stream, const argument& result, const argument& arg);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/include/migraphx/gpu/quant_gemm.hpp

@@ -17,7 +17,16 @@ struct miopen_quant_gemm
     shape compute_shape(const std::vector<shape>& inputs) const;
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
+
+struct hip_pack
+{
+    std::string name() const { return "gpu::gemm_pack"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+};
 } // namespace gpu
src/targets/gpu/lowering.cpp

@@ -97,7 +97,6 @@ struct miopen_apply
     add_generic_op<hip_min>("min");
     add_extend_op<miopen_gemm, op::dot>("dot");
-    add_extend_op<miopen_quant_gemm, op::quant_dot>("quant_dot");
     add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
     add_extend_op<hip_concat, op::concat>("concat");
     add_extend_op<miopen_softmax, op::softmax>("softmax");
@@ -110,6 +109,7 @@ struct miopen_apply
     add_quant_convolution_op();
     add_pooling_op();
     add_batch_norm_inference_op();
+    add_quant_gemm_op();
 }

 void apply()
@@ -263,6 +263,35 @@ struct miopen_apply
                 output);
         });
     }
+
+    void add_quant_gemm_op()
+    {
+        apply_map.emplace("quant_gemm", [=](instruction_ref ins) {
+            auto&& op   = any_cast<op::quant_dot>(ins->get_operator());
+            auto output = insert_allocation(ins, ins->get_shape());
+            std::vector<instruction_ref> refs = ins->inputs();
+            refs.push_back(output);
+
+            // Need up to two extra buffers to hold the packed data
+            auto shape_a = refs.at(0)->get_shape();
+            if(shape_a.transposed())
+            {
+                auto pack_a = insert_allocation(ins, shape_a);
+                refs.push_back(pack_a);
+                std::swap(refs.back(), refs.at(0));
+            }
+
+            auto shape_b = refs.at(1)->get_shape();
+            if(!shape_b.transposed())
+            {
+                auto pack_b = insert_allocation(ins, shape_b);
+                refs.push_back(pack_b);
+                std::swap(refs.back(), refs.at(1));
+            }
+
+            return prog->replace_instruction(ins, miopen_quant_gemm{op}, refs);
+        });
+    }
 };

 void lowering::apply(program& p) const { miopen_apply{&p, ctx}.apply(); }
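Each std::swap above leaves the packed-buffer allocation in the operand slot the gemm reads, while the original operand drifts to the tail of refs; compute() in quant_gemm.cpp relies on that tail ordering. A standalone trace, with hypothetical strings in place of instruction_refs and assuming A transposed and B not, sketches the resulting layout:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // refs starts as the instruction's inputs plus the output allocation
    std::vector<std::string> refs = {"A", "B", "output"};

    // A is transposed: allocate pack_a and swap it into slot 0
    refs.push_back("pack_a_alloc");
    std::swap(refs.back(), refs.at(0)); // {pack_a_alloc, B, output, A}

    // B is not transposed: allocate pack_b and swap it into slot 1
    refs.push_back("pack_b_alloc");
    std::swap(refs.back(), refs.at(1)); // {pack_a_alloc, pack_b_alloc, output, A, B}

    for(const auto& r : refs)
        std::cout << r << ' '; // prints: pack_a_alloc pack_b_alloc output A B
    std::cout << '\n';
}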
src/targets/gpu/quant_gemm.cpp

 #include <migraphx/gpu/quant_gemm.hpp>
+#include <migraphx/gpu/device/pack.hpp>
 #include <migraphx/gpu/context.hpp>

 namespace migraphx {
...
@@ -61,7 +62,36 @@ argument miopen_quant_gemm::compute(context& ctx,
                                     const shape& output_shape,
                                     const std::vector<argument>& args) const
 {
-    bool is_3inputs = (args.size() == 4);
+    // handling the packing of B MUST happen before handling that of A,
+    // since it determines where the original A argument sits in args
+    bool transa = args[0].get_shape().transposed();
+    bool transb = args[1].get_shape().transposed();
+    auto n_dim  = output_shape.lens().size();
+    auto dim_1  = n_dim - 1;
+    auto dim_0  = n_dim - 2;
+    rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
+    rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
+    rocblas_int ldc = args[2].get_shape().strides()[dim_0];
+    size_t addi_ref_num = 0;
+
+    if(!transb)
+    {
+        ++addi_ref_num;
+        // the original argument for B is the last one in the input argument vector
+        const argument& arg_b = args[args.size() - 1];
+        // use the pack_a algorithm to pack B
+        device::pack_a(ctx.get_stream().get(), args[1], arg_b);
+    }
+
+    // need to pack A in this scenario; use the pack_b algorithm described in
+    // the comment of the API
+    if(transa)
+    {
+        // read the original A before bumping the extra-reference count
+        const argument& arg_a = args[args.size() - 1 - addi_ref_num];
+        ++addi_ref_num;
+        device::pack_b(ctx.get_stream().get(), args[0], arg_a);
+    }
+
+    bool is_3inputs = (args.size() - addi_ref_num == 4);
     int8_t beta     = 0;
     if(is_3inputs)
     {
...
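With both operands needing packing, the lowering above hands compute() args = {pack_a_alloc, pack_b_alloc, output, A, B}: args[size - 1] is the original B, and once addi_ref_num counts B's appended reference, args[size - 1 - addi_ref_num] is the original A. A minimal standalone check of that bookkeeping, using hypothetical strings for arguments:

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
    // layout produced by add_quant_gemm_op when A is transposed and B is not
    std::vector<std::string> args = {"pack_a_alloc", "pack_b_alloc", "output", "A", "B"};

    std::size_t addi_ref_num = 0;
    bool transa = true;
    bool transb = false;

    if(!transb)
    {
        ++addi_ref_num;
        assert(args[args.size() - 1] == "B"); // original B rides at the very end
    }
    if(transa)
    {
        // original A sits just in front of any appended B reference
        assert(args[args.size() - 1 - addi_ref_num] == "A");
        ++addi_ref_num;
    }
    // two packing buffers were appended, so the "real" argument count is size - 2
    assert(args.size() - addi_ref_num == 3); // dot without bias: A, B, output
}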
@@ -71,42 +101,14 @@ argument miopen_quant_gemm::compute(context& ctx,
     auto a_lens = args[0].get_shape().lens();
     auto b_lens = args[1].get_shape().lens();
     output_shape.visit_type([&](auto as) {
-        auto n_dim   = output_shape.lens().size();
-        auto dim_1   = n_dim - 1;
-        auto dim_0   = n_dim - 2;
         auto alpha_r = to_rocblas_type(as(op.alpha));
         auto beta_r  = to_rocblas_type(as(beta));
-        bool transa     = args[0].get_shape().transposed();
-        bool transb     = args[1].get_shape().transposed();
-        rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-        rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-        rocblas_int ldc = args[2].get_shape().strides()[dim_0];
         auto out_lens   = output_shape.lens();
         rocblas_int m   = out_lens[dim_0];
         rocblas_int n   = out_lens[dim_1];
         rocblas_int k   = args[0].get_shape().lens()[dim_1];
         auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
+        assert(k % 4 == 0);
+        assert(!transa or (lda % 4 == 0));
+        assert(transb or (ldb % 4 == 0));
-        // need to pack B in this scenario
-        if(!transb)
-        {
-            int nb = 4;
-            for(int i_m = 0; i_m < m; i_m++)
-            {
-                for(int i_k = 0; i_k < k; i_k++)
-                {
-                    A_packed[i_k % nb + (i_m + (i_k / nb) * lda) * nb] = A[i_m + i_k * lda];
-                }
-            }
-        }
-        // need to pack A in this scenario
-        if(transa)
-        {
-        }
         auto num_matrices = std::accumulate(
             out_lens.rbegin() + 2,
             out_lens.rend(),
             std::size_t{1},
             std::multiplies<std::size_t>());
...
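The accumulate folds every output dimension except the trailing two (the matrix dimensions) into the number of gemms launched for a batched call. A quick standalone check with a hypothetical 4-d shape:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // out_lens = {2, 3, 64, 64}: batch dims {2, 3} give 6 matrices of 64x64
    std::vector<std::size_t> out_lens = {2, 3, 64, 64};
    auto num_matrices = std::accumulate(
        out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
    std::cout << num_matrices << '\n'; // prints 6
}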