"test/srt/models/test_grok_models.py" did not exist on "8207637029082563cab74951fe8d5f86b574b85e"
Unverified commit 6416f066, authored by Paul Fultz II, committed by GitHub

Merge branch 'develop' into fastsoftmax

parents 647d5dc5 97a1ed2d
@@ -138,16 +138,16 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
         std::size_t groups     = (n + local - 1) / local;
         std::size_t max_blocks = max_global / local;
         std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
-        return nglobal;
+        return std::min(nglobal, n);
     };
 }
 
 std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
 {
-    size_t block_size = 128;
-    while(block_size <= max_block_size and block_size <= n)
-        block_size *= 2;
-    return block_size / 2;
+    const std::size_t min_block_size  = 64;
+    const std::size_t base_block_size = 32;
+    auto block_size = (((n - 1) / base_block_size + 1)) * base_block_size;
+    return std::min(std::max(min_block_size, block_size), max_block_size);
 }
 
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
......
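For reference, a minimal standalone sketch of the new block-size rule (the sketch function and the driver below are illustrative, not part of the patch): the result is n rounded up to the next multiple of 32, then clamped to [64, max_block_size].

// Illustrative only: mirrors the new compute_block_size logic from the hunk above.
#include <algorithm>
#include <cstddef>
#include <iostream>

std::size_t compute_block_size_sketch(std::size_t n, std::size_t max_block_size)
{
    const std::size_t min_block_size  = 64;
    const std::size_t base_block_size = 32;
    // Round n up to the next multiple of 32 (n is assumed to be at least 1, as in the patch),
    // then clamp to [64, max_block_size].
    auto block_size = ((n - 1) / base_block_size + 1) * base_block_size;
    return std::min(std::max(min_block_size, block_size), max_block_size);
}

int main()
{
    std::cout << compute_block_size_sketch(1, 1024) << "\n";    // 64
    std::cout << compute_block_size_sketch(100, 1024) << "\n";  // 128
    std::cout << compute_block_size_sketch(5000, 1024) << "\n"; // clamped to 1024
}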
@@ -131,7 +131,7 @@ struct hip_array
     friend MIGRAPHX_DEVICE_CONSTEXPR bool operator!=(const hip_array& x, const hip_array& y)
     {
-        return !(x == y);
+        return not(x == y);
     }
 
     // This uses the product order rather than lexical order
     friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<(const hip_array& x, const hip_array& y)
......
@@ -117,12 +117,13 @@ template <class V, class F, class... Ts>
 void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
 {
     std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
-    if(!std::all_of(
+    if(not std::all_of(
            types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
         MIGRAPHX_THROW("Types must be the same");
     std::initializer_list<index_int> ranks = {
         static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(!std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    if(not std::all_of(
+           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
         MIGRAPHX_THROW("Ranks must be the same");
     visit_tensor_size(s.lens().size(), [&](auto ndim) {
         s.visit_type(hip_visitor([&](auto as) { v(f(xs, ndim, as)...); }));
@@ -134,7 +135,8 @@ void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
 {
     std::initializer_list<index_int> ranks = {
         static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(!std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    if(not std::all_of(
+           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
         MIGRAPHX_THROW("Ranks must be the same");
     visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
 }
......
@@ -47,7 +47,7 @@ constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
         it   = first;
         step = count / 2;
         std::advance(it, step);
-        if(!(value < *it))
+        if(not(value < *it))
         {
             first = ++it;
             count -= step + 1;
......
@@ -38,8 +38,11 @@ struct compile_op : action<compile_op>
         context ctx;
         auto inputs = p.parse_shapes(v.at("inputs"));
         auto op     = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
-        double t    = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << t << "ms" << std::endl;
+        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << host_time << "ms";
+        if(device_time > 0)
+            std::cout << ", " << device_time << "ms";
+        std::cout << std::endl;
     }
 };
......
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace driver {
 
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n = 100);
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
 
 } // namespace driver
 } // namespace gpu
......
@@ -42,22 +42,31 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
 }
 
 using milliseconds = std::chrono::duration<double, std::milli>;
 
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
 {
     // TODO: Use std::ref
-    migraphx::context gctx = ctx;
+    migraphx::context ctx = ictx;
+    auto& gctx = any_cast<migraphx::gpu::context>(ctx);
     auto output = op.compute_shape(inputs);
-    op.finalize(gctx, output, inputs);
+    op.finalize(ctx, output, inputs);
     auto args = generate_arguments(inputs);
     auto run = [&] {
-        op.compute(gctx, output, args);
-        gctx.finish();
+        op.compute(ctx, output, args);
+        ctx.finish();
     };
+    gctx.enable_perf_measurement();
     run();
-    auto r = range(n);
-    double t = std::accumulate(
-        r.begin(), r.end(), double{0.0}, [&](auto x, auto) { return x + time<milliseconds>(run); });
-    return t / n;
+    double host_time   = 0.0;
+    double device_time = 0.0;
+    for(auto i : range(n))
+    {
+        (void)i;
+        host_time += time<milliseconds>(run);
+        device_time += gctx.get_elapsed_ms();
+    }
+    return std::make_pair(host_time / n, device_time / n);
 }
 } // namespace driver
......
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
         auto op = make_op(name);
         if(v.contains("fields"))
             op.from_value(v.at("fields"));
-        double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << t << "ms" << std::endl;
+        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << host_time << "ms" << std::endl;
     }
 };
......
@@ -61,13 +61,25 @@ struct mlir_conv
 MIGRAPHX_REGISTER_OP(mlir_conv);
 
 namespace {
 
+MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
+{
+    if(ins->name() != "convolution")
+        return false;
+    value v    = ins->get_operator().to_value();
+    auto group = v.at("group").to<int>();
+    if(group != 1)
+        return false;
+    return true;
+}
+
 struct find_conv_pointwise
 {
     // Find a convolution followed by a pointwise operation.
     auto matcher() const
     {
         auto convolution =
-            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+            match::skip(match::name("contiguous"))(is_mlir_conv().bind("convolution"));
         return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
     }
......
@@ -26,7 +26,6 @@
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/clip.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/oper.hpp>
@@ -50,8 +49,6 @@
 #include <migraphx/array.hpp>
 #include <migraphx/permutation.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/clip.hpp>
-#include <migraphx/op/contiguous.hpp>
 #include <cmath>
 #include <set>
@@ -262,7 +259,7 @@ struct hip_add_relu : binary_device<hip_add_relu, &device::add_relu>
 };
 MIGRAPHX_REGISTER_OP(hip_add_relu)
 
-struct hip_add_sigmoid : binary_device<hip_add_relu, &device::add_sigmoid>
+struct hip_add_sigmoid : binary_device<hip_add_sigmoid, &device::add_sigmoid>
 {
 };
 MIGRAPHX_REGISTER_OP(hip_add_sigmoid)
@@ -1036,7 +1033,7 @@ struct find_gemm_pointwise
         // const-fold input if not standard shape since rocblas can't handle it
         if(not c_ins->get_shape().standard())
         {
-            auto c = op::contiguous{};
+            auto c = make_op("contiguous");
             auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
             c_ins  = m.add_literal(l.get_shape(), l.data());
         }
......
@@ -112,7 +112,7 @@ void gemm_impl(context& ctx,
               bool compute_fp32)
 {
     const bool is_3inputs = (args.size() == 4);
-    if(!is_3inputs)
+    if(not is_3inputs)
     {
         beta = 0;
     }
@@ -176,8 +176,13 @@ void gemm_impl(context& ctx,
     auto num_matrices = std::accumulate(
         out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
-    if(num_matrices == 1)
+    if(num_matrices == 1 or (num_matrices > 1 and get_batch_stride(args[1]) == 0))
     {
+        // If the batch dimension of B is broadcasted, then we can
+        // multiply m by the batch_size and use rocblas_gemm_ex
+        // instead of rocblas_gemm_strided_batched_ex.
+        m *= num_matrices;
         // the rocblas_gemm API handles inputs and output matrices as
         // column-major format. When doing a C = A * B, we actually do
         // C^T = (B^T) * (A^T). That is the reason we input args[1] as
......
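The broadcast-batch shortcut above rests on a simple identity: when every batch multiplies by the same B (batch stride 0), stacking the A batches along the row dimension and doing one GEMM with m' = batch * m gives exactly the same numbers as the strided-batched call. A self-contained CPU sketch of that identity (plain loops and illustrative names, ignoring the row/column-major swap the real rocBLAS call performs):

// Illustrative check, not MIGraphX code: a broadcast B (batch stride 0) lets a
// batched GEMM collapse into one GEMM with m' = batch * m.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// C[m x n] += A[m x k] * B[k x n], all row-major.
void gemm(const std::vector<double>& a, const std::vector<double>& b, std::vector<double>& c,
          std::size_t m, std::size_t k, std::size_t n)
{
    for(std::size_t i = 0; i < m; i++)
        for(std::size_t j = 0; j < n; j++)
            for(std::size_t l = 0; l < k; l++)
                c[i * n + j] += a[i * k + l] * b[l * n + j];
}

int main()
{
    const std::size_t batch = 3, m = 2, k = 4, n = 5;
    std::vector<double> a(batch * m * k), b(k * n);
    for(std::size_t i = 0; i < a.size(); i++) a[i] = 0.5 * i;
    for(std::size_t i = 0; i < b.size(); i++) b[i] = 0.25 * i;

    // Batched path: one GEMM per batch, B reused (batch stride 0).
    std::vector<double> c1(batch * m * n, 0.0);
    for(std::size_t g = 0; g < batch; g++)
    {
        std::vector<double> ag(a.begin() + g * m * k, a.begin() + (g + 1) * m * k);
        std::vector<double> cg(m * n, 0.0);
        gemm(ag, b, cg, m, k, n);
        std::copy(cg.begin(), cg.end(), c1.begin() + g * m * n);
    }

    // Collapsed path: one GEMM with m' = batch * m over the stacked A batches.
    std::vector<double> c2(batch * m * n, 0.0);
    gemm(a, b, c2, batch * m, k, n);

    assert(c1 == c2); // identical results, element for element
}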
@@ -244,6 +244,15 @@ struct context
         return hip_event_ptr{event};
     }
 
+    static hip_event_ptr create_event_for_timing()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreate(&event);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event");
+        return hip_event_ptr{event};
+    }
+
     value to_value() const
     {
         value result;
@@ -267,10 +276,49 @@ struct context
     any_ptr get_queue() { return get_stream().get(); }
 
+    void enable_perf_measurement(bool b = true)
+    {
+        if(b)
+        {
+            start_event = create_event_for_timing();
+            stop_event  = create_event_for_timing();
+            get_stream().record(start_event.get());
+            get_stream().record(stop_event.get());
+        }
+        else
+        {
+            start_event = nullptr;
+            stop_event  = nullptr;
+        }
+        measure_perf = b;
+    }
+
+    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
+    {
+        if(measure_perf)
+            return std::make_pair(start_event.get(), stop_event.get());
+        return std::make_pair(nullptr, nullptr);
+    }
+
+    float get_elapsed_ms() const
+    {
+        float result = 0;
+        if(start_event != nullptr and stop_event != nullptr)
+        {
+            auto status = hipEventElapsedTime(&result, start_event.get(), stop_event.get());
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
+        }
+        return result;
+    }
+
     private:
     // TODO: Make this a vector to support multiple devices
     std::shared_ptr<hip_device> current_device;
     std::vector<shared<hip_event_ptr>> events;
+    bool measure_perf                 = false;
+    shared<hip_event_ptr> start_event = nullptr;
+    shared<hip_event_ptr> stop_event  = nullptr;
 };
 
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
......
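The new context hooks wrap the standard HIP event-timing pattern: create two events, record one before and one after the device work, synchronize, then read hipEventElapsedTime. A standalone sketch of that pattern outside MIGraphX (the kernel and names below are illustrative; compile with hipcc):

// Plain HIP runtime illustration of what enable_perf_measurement / get_elapsed_ms wrap.
#include <hip/hip_runtime.h>
#include <iostream>

#define CHECK(x)                          \
    if((x) != hipSuccess)                 \
    {                                     \
        std::cerr << "HIP call failed\n"; \
        return 1;                         \
    }

__global__ void scale_add(float* p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        p[i] = p[i] * 2.0f + 1.0f;
}

int main()
{
    const int n = 1 << 20;
    float* d    = nullptr;
    CHECK(hipMalloc(&d, n * sizeof(float)));
    CHECK(hipMemset(d, 0, n * sizeof(float)));

    hipEvent_t start, stop;
    CHECK(hipEventCreate(&start)); // what create_event_for_timing() does
    CHECK(hipEventCreate(&stop));

    CHECK(hipEventRecord(start, nullptr)); // recorded before the device work
    hipLaunchKernelGGL(scale_add, dim3((n + 255) / 256), dim3(256), 0, nullptr, d, n);
    CHECK(hipEventRecord(stop, nullptr));  // recorded after the device work
    CHECK(hipEventSynchronize(stop));      // analogous to ctx.finish()

    float ms = 0;
    CHECK(hipEventElapsedTime(&ms, start, stop)); // what get_elapsed_ms() reads
    std::cout << "device time: " << ms << "ms\n";

    hipEventDestroy(start);
    hipEventDestroy(stop);
    CHECK(hipFree(d));
}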
@@ -27,7 +27,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/op/gather.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/context.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......
@@ -37,6 +37,8 @@ namespace gpu {
 struct context;
 
+std::string hip_error(int error);
+
 argument allocate_gpu(const shape& s, bool host = false);
 argument register_on_gpu(const argument& arg);
......
@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
 
 #include <migraphx/argument.hpp>
-#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>
......
@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
 
 #include <migraphx/argument.hpp>
-#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>
......
@@ -50,17 +50,22 @@ struct kernel
     void launch(hipStream_t stream,
                 std::size_t global,
                 std::size_t local,
-                const std::vector<kernel_argument>& args) const;
+                const std::vector<kernel_argument>& args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop  = nullptr) const;
 
     void launch(hipStream_t stream,
                 std::size_t global,
                 std::size_t local,
-                std::vector<void*> args) const;
+                std::vector<void*> args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop  = nullptr) const;
 
-    auto launch(hipStream_t stream, std::size_t global, std::size_t local) const
+    template <class... Ts>
+    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
     {
         return [=](auto&&... xs) {
-            launch(stream, global, local, std::vector<kernel_argument>{xs...});
+            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
         };
     }
......
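The curried launch overload now forwards any trailing arguments (here the optional start/stop events) to the full overload together with the packed kernel arguments. A small standalone sketch of that forwarding pattern with dummy types (illustrative only, not MIGraphX code):

// Minimal illustration of the variadic forwarding used by kernel::launch above.
#include <cstddef>
#include <iostream>
#include <vector>

struct fake_kernel
{
    // Full overload: takes the packed arguments plus optional trailing "events".
    void launch(int stream, std::size_t global, std::size_t local,
                const std::vector<int>& args, const char* start = nullptr,
                const char* stop = nullptr) const
    {
        std::cout << "launch with " << args.size() << " args, events: "
                  << (start != nullptr) << "/" << (stop != nullptr) << "\n";
    }

    // Curried overload: captures any trailing arguments and forwards them.
    template <class... Ts>
    auto launch(int stream, std::size_t global, std::size_t local, Ts... zs) const
    {
        return [=](auto&&... xs) {
            launch(stream, global, local, std::vector<int>{xs...}, zs...);
        };
    }
};

int main()
{
    fake_kernel k;
    k.launch(0, 1024, 64)(1, 2, 3);              // no timing events forwarded
    const char* start = "s";
    const char* stop  = "t";
    k.launch(0, 1024, 64, start, stop)(1, 2, 3); // events forwarded to the full overload
}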
@@ -24,22 +24,10 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
 #define MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
 
-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
 #include <migraphx/op/logsoftmax.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/config.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
+#include <migraphx/shape.hpp>
+#include <migraphx/reflect.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <utility>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......
@@ -26,7 +26,7 @@
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/context.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
 
 #include <migraphx/config.hpp>
-#include <migraphx/gpu/context.hpp>
+#include <string>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......