Merge

7e297b13 · Paul · 86ea5e91 · aa7ff911 · 7e297b13 · 7e297b13
Commit 7e297b13 authored Jun 13, 2022 by Paul
20 changed files
--- a/src/targets/gpu/include/migraphx/gpu/device/topk.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/topk.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_TOPK_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+argument topk_smallest(hipStream_t stream, // Declaration only; the definition is not in this header.
+                       const argument& val_res, // NOTE(review): presumably receives the selected values - confirm in the implementation
+                       const argument& ind_res, // NOTE(review): presumably receives the indices of those values - confirm
+                       const argument& arg, // NOTE(review): presumably the input being searched - confirm
+                       int64_t k, // NOTE(review): presumably the number of elements to keep - confirm
+                       int64_t axis); // NOTE(review): presumably the dimension searched along; which argument is returned is not visible here
+argument topk_largest(hipStream_t stream, // Declaration only; same parameter list as topk_smallest declared just before it.
+                      const argument& val_res, // NOTE(review): presumably receives the selected values - confirm in the implementation
+                      const argument& ind_res, // NOTE(review): presumably receives the indices of those values - confirm
+                      const argument& arg, // NOTE(review): presumably the input being searched - confirm
+                      int64_t k, // NOTE(review): presumably the number of elements to keep - confirm
+                      int64_t axis); // NOTE(review): presumably the dimension searched along; which argument is returned is not visible here
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/where.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/where.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_WHERE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_WHERE_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+void where(hipStream_t stream, // Declaration only; takes a HIP stream, one `result` argument and three further arguments.
+           const argument& result, // NOTE(review): presumably the output buffer - confirm in the implementation
+           const argument& arg0, // NOTE(review): roles of arg0..arg2 (condition / alternatives) are not visible here - confirm
+           const argument& arg1,
+           const argument& arg2);
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
@@ -14,7 +14,7 @@ namespace gpu {
 struct eliminate_workspace
 {
    std::string name() const { return "eliminate_workspace"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
@@ -16,7 +16,7 @@ struct fuse_ops
    context* ctx   = nullptr;
    bool fast_math = true;
    std::string name() const { return "gpu::fuse_ops"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };
 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
 #define MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
+#include <migraphx/errors.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/value.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -15,17 +18,24 @@ namespace gpu {
 struct context;
+void blas_shape(const shape& s);
 template <class Op>
 struct rocblas_gemm
 {
    Op op;
+    float alpha         = 1;
+    float beta          = 0;
    bool int8_x4_format = true;
+    bool compute_fp32   = false;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack_join(migraphx::reflect(self.op, f),
-                         pack(f(self.int8_x4_format, "int8_x4_format")));
+                         pack(f(self.alpha, "alpha"),
+                              f(self.beta, "beta"),
+                              f(self.int8_x4_format, "int8_x4_format")));
    }
    std::string name() const
@@ -42,8 +52,30 @@ struct rocblas_gemm
        std::vector<shape> in_shapes(inputs);
        in_shapes.pop_back();
        check_shapes{in_shapes, *this}.not_broadcasted();
-        batch_not_transposed(inputs[0].strides());
+        blas_shape(inputs[0]);
-        batch_not_transposed(inputs[1].strides());
+        blas_shape(inputs[1]);
+        // if gemm and add are fused
+        if(in_shapes.size() > 2)
+        {
+            auto cmat_shape = in_shapes.back();
+            in_shapes.pop_back();
+            blas_shape(cmat_shape);
+            auto op_out_shape = op.compute_shape(in_shapes);
+            if(cmat_shape.lens() != op_out_shape.lens())
+            {
+                MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" +
+                               to_string_range(cmat_shape.lens()) +
+                               "}, cannot add to operand A * B: {" +
+                               to_string_range(op_out_shape.lens()) + "}");
+            }
+            if(cmat_shape.type() != op_out_shape.type())
+            {
+                MIGRAPHX_THROW(this->name() + " : operand C type mismatch, operand C is of type: " +
+                               to_string(cmat_shape.type()) +
+                               ", it must be: " + to_string(op_out_shape.type()));
+            }
+            return op_out_shape;
+        }
        return op.compute_shape(in_shapes);
    }
@@ -51,30 +83,21 @@ struct rocblas_gemm
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
    {
-        gemm(ctx, output_shape, args, op.alpha, op.beta, int8_x4_format);
+        if(this->name() == "gpu::gemm")
-        return args.back();
-    }
-    void batch_not_transposed(const std::vector<std::size_t>& strides) const
-    {
-        if(strides.size() <= 2)
-            return;
-        auto dim_0       = strides.size() - 2;
-        auto matrix_size = std::max(strides[dim_0], strides[dim_0 + 1]);
-        std::vector<std::size_t> batch(strides.begin(), strides.begin() + dim_0);
-        if(std::all_of(batch.begin(), batch.end(), [&](auto i) { return (i < matrix_size); }))
        {
-            MIGRAPHX_THROW("GPU_GEMM: matrix size and batch size {" + to_string_range(strides) +
+            gemm(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
-                           "} are transposed!");
        }
+        else
-        if(std::adjacent_find(batch.begin(), batch.end(), [&](auto i, auto j) {
-               return (i < j or i < matrix_size or j < matrix_size);
-           }) != batch.end())
        {
-            MIGRAPHX_THROW("GPU_GEMM: batch size {" + to_string_range(strides) +
+            gemm(ctx,
-                           "} is transposed!");
+                 output_shape,
+                 args,
+                 int32_t(alpha),
+                 int32_t(beta),
+                 int8_x4_format,
+                 compute_fp32);
        }
+        return args.back();
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const

--- a/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp
@@ -14,13 +14,15 @@ void gemm(context& ctx,
          const std::vector<argument>& args,
          float alpha,
          float beta,
-          bool int8_x4_format);
+          bool int8_x4_format,
+          bool compute_fp32);
 void gemm(context& ctx,
          const shape& output_shape,
          const std::vector<argument>& args,
          int32_t alpha,
          int32_t beta,
-          bool int8_x4_format);
+          bool int8_x4_format,
+          bool compute_fp32);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/loop.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/loop.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_LOOP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_LOOP_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/reflect.hpp>
+#include <migraphx/op/loop.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+struct hip_loop // Operator struct that holds an op::loop and reports its name as "gpu::loop".
+{
+    op::loop op; // wrapped operator; reflect() below forwards to it
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f); // reflection is delegated entirely to the wrapped op
+    }
+    std::string name() const { return "gpu::loop"; }
+    shape compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const; // declared only; body is not in this header
+    argument
+    compute(context& ctx,
+            const shape& output_shape,
+            const std::vector<argument>& args,
+            const std::vector<module_ref>& mods,
+            const std::function<std::vector<argument>(
+                module_ref&, const std::unordered_map<std::string, argument>&)>& run) const; // `run` maps (module, name->argument map) to a vector of arguments; NOTE(review): presumably executes a submodule - confirm
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1; // index of the last entry of `shapes`; the subtraction is unsigned, so an empty vector wraps before conversion
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/miopen.hpp
@@ -9,6 +9,8 @@
 #include <miopen/miopen.h>
 #include <migraphx/config.hpp>
+#include <sstream>
 #ifdef HAS_FIND_MODE_API
 extern "C" miopenStatus_t miopenHiddenSetConvolutionFindMode(miopenConvolutionDescriptor_t convDesc,
                                                             int findMode);
@@ -132,12 +134,16 @@ inline convolution_descriptor make_deconv(const T& op)
 inline pooling_descriptor make_pooling(const migraphx::op::pooling& op)
 {
    miopenPoolingMode_t mode;
-    if(op.mode == "max")
+    if(op.mode == op::pooling_mode::max)
        mode = miopenPoolingMax;
-    else if(op.mode == "average")
+    else if(op.mode == op::pooling_mode::average)
        mode = miopenPoolingAverage;
    else
-        MIGRAPHX_THROW("Unknown mode for pooling: " + op.mode);
+    {
+        std::stringstream ss; // start empty: text passed to the constructor would be overwritten by the first <<
+        ss << "Unknown mode for pooling: " << op.mode;
+        MIGRAPHX_THROW(ss.str());
+    }
    auto p = make_obj<pooling_descriptor>(&miopenCreatePoolingDescriptor);
    int kdims = op.kdims();

--- a/src/targets/gpu/include/migraphx/gpu/multinomial.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/multinomial.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP
+#define MIGRAPHX_GUARD_RTGLIB_MULTINOMIAL_HPP
+#include <migraphx/op/multinomial.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+struct hip_multinomial // Operator struct that holds an op::multinomial and reports its name as "gpu::multinomial".
+{
+    op::multinomial op; // wrapped operator; reflect() below forwards to it
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f); // reflection is delegated entirely to the wrapped op
+    }
+    std::string name() const { return "gpu::multinomial"; }
+    shape compute_shape(std::vector<shape> inputs) const; // declared only; body is not in this header
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; // declared only; body is not in this header
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1; // index of the last entry of `shapes`; the subtraction is unsigned, so an empty vector wraps before conversion
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/nonzero.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/nonzero.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP
+#define MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/reflect.hpp>
+#include <migraphx/op/nonzero.hpp>
+#include <migraphx/gpu/miopen.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+struct hip_nonzero // Operator struct that holds an op::nonzero and reports its name as "gpu::nonzero".
+{
+    op::nonzero op; // wrapped operator; reflect() below forwards to it
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f); // reflection is delegated entirely to the wrapped op
+    }
+    std::string name() const { return "gpu::nonzero"; }
+    shape compute_shape(std::vector<shape> inputs) const; // declared only; body is not in this header
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; // declared only; body is not in this header
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1; // index of the last entry of `shapes`; the subtraction is unsigned, so an empty vector wraps before conversion
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/prefix_scan_sum.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefix_scan_sum.hpp
@@ -40,9 +40,8 @@ struct hip_prefix_scan_sum : oper<hip_prefix_scan_sum>
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
-        if(op.exclusive or op.reverse)
+        device::prefix_scan_sum(
-            MIGRAPHX_THROW("Exclusive and reverse scan not supported");
+            ctx.get_stream().get(), args[1], args[0], op.axis, op.exclusive, op.reverse);
-        device::prefix_scan_sum(ctx.get_stream().get(), args[1], args[0], op.axis);
        return args[1];
    }

--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
+#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/context.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct module;
+namespace gpu {
+struct prefuse_ops // Exposes name() ("gpu::prefuse_ops") and apply(module&); apply is only declared in this header.
+{
+    std::string name() const { return "gpu::prefuse_ops"; }
+    void apply(module& m) const; // takes the module by non-const reference; NOTE(review): presumably rewrites it in place - confirm in the .cpp
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -2,6 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
+#include <migraphx/reflect.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/gpu/miopen.hpp>
@@ -14,6 +15,7 @@ struct context;
 struct miopen_quant_convolution
 {
    op::quant_convolution op;
+    bool int8_x4_format = false;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
    miopenHandle_t handle = nullptr;
@@ -22,7 +24,8 @@ struct miopen_quant_convolution
    static auto reflect(Self& self, F f)
    {
        // TODO: Add algo
-        return op::quant_convolution::reflect(self.op, f);
+        return pack_join(migraphx::reflect(self.op, f),
+                         pack(f(self.int8_x4_format, "int8_x4_format")));
    }
    std::string name() const { return "gpu::quant_convolution"; }

--- a/src/targets/gpu/include/migraphx/gpu/scatter.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/scatter.hpp
@@ -3,7 +3,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/reflect.hpp>
-#include <migraphx/op/scatter.hpp>
+#include <migraphx/op/scatter_none.hpp>
 #include <migraphx/gpu/miopen.hpp>
 namespace migraphx {
@@ -14,7 +14,9 @@ struct context;
 struct hip_scatter
 {
-    op::scatter op;
+    // scatter_none is an exact replacement for previous op::scatter,
+    // renamed to match an Onnx option.  Don't use base class op::scatter
+    op::scatter_none op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -22,7 +24,7 @@ struct hip_scatter
        return migraphx::reflect(self.op, f);
    }
-    std::string name() const { return "gpu::scatter"; }
+    std::string name() const { return "gpu::scatter_none"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;

--- a/src/targets/gpu/include/migraphx/gpu/schedule_model.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/schedule_model.hpp
@@ -17,9 +17,9 @@ struct schedule_model
 {
    std::size_t streams = 0;
    std::size_t concurrency() const;
-    void sched(module& p, instruction_ref ins, std::size_t n) const;
+    void sched(module& m, instruction_ref ins, std::size_t n) const;
-    void wait(module& p, instruction_ref ins, std::size_t wait_id) const;
+    void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
-    void record(module& p, instruction_ref ins, std::size_t wait_id) const;
+    void record(module& m, instruction_ref ins, std::size_t wait_id) const;
    std::size_t weight(const operation& op) const;
 };

--- a/src/targets/gpu/include/migraphx/gpu/sync_device.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/sync_device.hpp
@@ -15,7 +15,7 @@ namespace gpu {
 struct sync_device
 {
    std::string name() const { return "sync_device"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/topk.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/topk.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_TOPK_HPP
+#define MIGRAPHX_GUARD_RTGLIB_TOPK_HPP
+#include <migraphx/argument.hpp>
+#include <migraphx/reflect.hpp>
+#include <migraphx/op/topk.hpp>
+#include <migraphx/gpu/miopen.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct context;
+struct hip_topk // Operator struct that holds an op::topk and reports its name as "gpu::topk".
+{
+    op::topk op; // wrapped operator; reflect() below forwards to it
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f); // reflection is delegated entirely to the wrapped op
+    }
+    std::string name() const { return "gpu::topk"; }
+    shape compute_shape(std::vector<shape> inputs) const; // declared only; body is not in this header
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const; // declared only; body is not in this header
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1; // index of the last entry of `shapes`; the subtraction is unsigned, so an empty vector wraps before conversion
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/where.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/where.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_WHERE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_WHERE_HPP
+#include <migraphx/gpu/oper.hpp>
+#include <migraphx/gpu/device/where.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct hip_where : ternary_device<hip_where, device::where> // Derives from ternary_device instantiated with itself and device::where; only compute_shape is defined here.
+{
+    shape compute_shape(const std::vector<shape>& inputs) const // Picks the output shape from inputs 1 and 2 only; branch order below matters.
+    {
+        check_shapes{inputs, *this}.has(4).same_dims(); // validation via check_shapes: has(4) and same_dims()
+        auto s1 = inputs.at(1);
+        auto s2 = inputs.at(2);
+        if(s1 == s2 and s1.packed()) // identical and packed: reuse s1 unchanged
+        {
+            return s1;
+        }
+        else if(s1.packed() != s2.packed()) // exactly one of the two is packed: return that one
+        {
+            return s1.packed() ? s1 : s2;
+        }
+        else if(s1.broadcasted() != s2.broadcasted()) // exactly one is broadcasted: build from the other one
+        {
+            return s1.broadcasted() ? s2.with_lens(s1.lens()) : s1.with_lens(s1.lens()); // both arms pass s1.lens() to with_lens
+        }
+        else
+        {
+            return {s1.type(), s1.lens()}; // new shape from s1's type and lens only; NOTE(review): presumably gets default strides - confirm shape's constructor
+        }
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/write_literals.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/write_literals.hpp
@@ -14,7 +14,7 @@ struct write_literals
    context* ctx = nullptr;
    std::string name() const { return "gpu::write_literals"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
 };
 } // namespace gpu

--- a/src/targets/gpu/int8_conv_pack.cpp
+++ b/src/targets/gpu/int8_conv_pack.cpp
@@ -5,10 +5,25 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
+shape pack_int8_shape(const shape& s) // Builds a shape from `s` with lens[1] rounded up to a multiple of 4 and strides[0] recomputed; throws for non-int8 types.
+{
+    if(s.type() != shape::int8_type) // any type other than int8 is rejected
+    {
+        MIGRAPHX_THROW("PACK_INT8_ARGS: only process int8_type");
+    }
+    auto lens    = s.lens();
+    auto strides = s.strides();
+    lens[1]      = (lens[1] + 3) / 4 * 4; // integer round-up of dimension 1 to the next multiple of 4; NOTE(review): no rank check before indexing [1] - assumes at least 2 dimensions
+    strides[0]   = strides[1] * lens[1]; // dimension 0 stride now spans the padded dimension 1; other strides are kept as they were
+    return {s.type(), lens, strides};
+}
 shape miopen_int8_conv_pack::compute_shape(const std::vector<shape>& inputs) const
 {
    check_shapes{{inputs.at(0)}, *this}.has(1).standard();
-    return inputs.at(0);
+    return pack_int8_shape(inputs.at(0));
 }
 argument