Update time op to more accurately get device time (#2104)

Commit 34b68ee4 (unverified) · related ref: bc7db104
Authored Oct 11, 2023 by Paul Fultz II; committed Oct 11, 2023 by GitHub.
6 changed files
--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -185,8 +185,7 @@ struct compile_plan
            results.begin(), results.end(), std::back_inserter(times), [&](const auto& cr) {
                if(not cr.has_value())
                    return std::numeric_limits<double>::max();
-                return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20)
-                    .first;
+                return time_op(*ctx, cr->replace.code_object, to_shapes(cr->ins->inputs()), 20);
            });
        auto i = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
        std::cout << "Fastest solution: " << config->solutions.at(i) << std::endl;

--- a/src/targets/gpu/driver/compile_op.cpp
+++ b/src/targets/gpu/driver/compile_op.cpp
@@ -38,10 +38,8 @@ struct compile_op : action<compile_op>
        context ctx;
        auto inputs = p.parse_shapes(v.at("inputs"));
        auto op     = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
-        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        auto t      = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << host_time << "ms";
+        std::cout << op << ": " << t << "ms";
-        if(device_time > 0)
-            std::cout << ", " << device_time << "ms";
        std::cout << std::endl;
    }
 };

--- a/src/targets/gpu/driver/run_op.cpp
+++ b/src/targets/gpu/driver/run_op.cpp
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
        auto op = make_op(name);
        if(v.contains("fields"))
            op.from_value(v.at("fields"));
-        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        auto t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << host_time << "ms" << std::endl;
+        std::cout << op << ": " << t << "ms" << std::endl;
    }
 };

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -299,23 +299,6 @@ struct context
    any_ptr get_queue() { return get_stream().get(); }
-    void enable_perf_measurement(bool b = true)
-    {
-        if(b)
-        {
-            start_event = create_event_for_timing();
-            stop_event  = create_event_for_timing();
-            get_stream().record(start_event.get());
-            get_stream().record(stop_event.get());
-        }
-        else
-        {
-            start_event = nullptr;
-            stop_event  = nullptr;
-        }
-        measure_perf = b;
-    }
    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
    {
        if(measure_perf)
@@ -323,12 +306,12 @@ struct context
        return std::make_pair(nullptr, nullptr);
    }
-    float get_elapsed_ms() const
+    static float get_elapsed_ms(hipEvent_t start, hipEvent_t stop)
    {
        float result = 0;
-        if(start_event != nullptr and stop_event != nullptr)
+        if(start != nullptr and stop != nullptr)
        {
-            auto status = hipEventElapsedTime(&result, start_event.get(), stop_event.get());
+            auto status = hipEventElapsedTime(&result, start, stop);
            if(status != hipSuccess)
                MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
        }

--- a/src/targets/gpu/include/migraphx/gpu/time_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/time_op.hpp
@@ -32,7 +32,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-MIGRAPHX_GPU_EXPORT std::pair<double, double>
+MIGRAPHX_GPU_EXPORT double
 time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
 } // namespace gpu

--- a/src/targets/gpu/time_op.cpp
+++ b/src/targets/gpu/time_op.cpp
@@ -41,8 +41,7 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
 }
 using milliseconds = std::chrono::duration<double, std::milli>;
-std::pair<double, double>
-time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
+double time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
 {
    // TODO: Use std::ref
@@ -51,21 +50,19 @@ time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
    auto output           = op.compute_shape(inputs);
    op.finalize(ctx, output, inputs);
    auto args = generate_arguments(inputs);
-    auto run  = [&] {
-        op.compute(ctx, output, args);
-        ctx.finish();
-    };
-    gctx.enable_perf_measurement();
+    auto start = context::create_event_for_timing();
+    auto stop  = context::create_event_for_timing();
+    auto run   = [&] { op.compute(ctx, output, args); };
    run();
-    double host_time   = 0.0;
-    double device_time = 0.0;
+    gctx.get_stream().record(start.get());
    for(auto i : range(n))
    {
        (void)i;
-        host_time += time<milliseconds>(run);
+        run();
-        device_time += gctx.get_elapsed_ms();
    }
-    return std::make_pair(host_time / n, device_time / n);
+    gctx.get_stream().record(stop.get());
+    gctx.finish();
+    return context::get_elapsed_ms(start.get(), stop.get()) / n;
 }
 } // namespace gpu