Commit db0301d7 authored by Paul
Browse files

Merge branch 'perk-kernel' into bert-opt2

parents 4dc2f1a1 6ee87f92
......@@ -51,7 +51,8 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
std::vector<void*> kargs(args.size());
std::transform(
args.begin(), args.end(), kargs.begin(), [](const argument& a) { return a.data(); });
k.launch(ctx.get_stream().get(), global, local, std::move(kargs));
auto [start, stop] = ctx.get_perf_events();
k.launch(ctx.get_stream().get(), global, local, std::move(kargs), start, stop);
return args[get_output_arg(args.size())];
}
void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)
......
......@@ -38,8 +38,11 @@ struct compile_op : action<compile_op>
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << host_time << "ms";
if(device_time > 0)
std::cout << ", " << device_time << "ms";
std::cout << std::endl;
}
};
......
......@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n = 100);
std::pair<double, double>
time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
} // namespace driver
} // namespace gpu
......
......@@ -42,22 +42,31 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
}
using milliseconds = std::chrono::duration<double, std::milli>;
double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
std::pair<double, double>
time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
{
// TODO: Use std::ref
migraphx::context gctx = ctx;
auto output = op.compute_shape(inputs);
op.finalize(gctx, output, inputs);
migraphx::context ctx = ictx;
auto& gctx = any_cast<migraphx::gpu::context>(ctx);
auto output = op.compute_shape(inputs);
op.finalize(ctx, output, inputs);
auto args = generate_arguments(inputs);
auto run = [&] {
op.compute(gctx, output, args);
gctx.finish();
op.compute(ctx, output, args);
ctx.finish();
};
gctx.enable_perf_measurement();
run();
auto r = range(n);
double t = std::accumulate(
r.begin(), r.end(), double{0.0}, [&](auto x, auto) { return x + time<milliseconds>(run); });
return t / n;
double host_time = 0.0;
double device_time = 0.0;
for(auto i : range(n))
{
(void)i;
host_time += time<milliseconds>(run);
device_time += gctx.get_elapsed_ms();
}
return std::make_pair(host_time / n, device_time / n);
}
} // namespace driver
......
......@@ -43,8 +43,8 @@ struct run_op : action<run_op>
auto op = make_op(name);
if(v.contains("fields"))
op.from_value(v.at("fields"));
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << host_time << "ms" << std::endl;
}
};
......
......@@ -244,6 +244,15 @@ struct context
return hip_event_ptr{event};
}
// Creates a HIP event with hipEventCreate (default flags) and wraps it in a
// hip_event_ptr. The events produced here are the ones later handed to
// hipEventElapsedTime by get_elapsed_ms().
//
// Returns: the owning wrapper around the newly created event.
// Throws:  via MIGRAPHX_THROW when hipEventCreate does not return hipSuccess;
//          the message carries the HIP error text, matching the other HIP
//          failure messages in this context.
static hip_event_ptr create_event_for_timing()
{
    // Start from a null handle so a failed create never leaves garbage behind.
    hipEvent_t event = nullptr;
    auto status      = hipEventCreate(&event);
    if(status != hipSuccess)
        MIGRAPHX_THROW("Failed to create event: " + hip_error(status));
    return hip_event_ptr{event};
}
value to_value() const
{
value result;
......@@ -267,10 +276,49 @@ struct context
any_ptr get_queue() { return get_stream().get(); }
// Switches device-side timing on or off for this context.
//
// b == true : allocates a fresh start/stop event pair, records both on the
//             stream returned by get_stream(), then sets measure_perf.
//             NOTE(review): the immediate record presumably ensures
//             get_elapsed_ms() has recorded events to query even before a
//             kernel launch re-records them - confirm.
// b == false: drops both events and clears measure_perf.
//
// If event creation throws, measure_perf keeps its previous value.
void enable_perf_measurement(bool b = true)
{
    if(not b)
    {
        start_event  = nullptr;
        stop_event   = nullptr;
        measure_perf = false;
        return;
    }

    start_event = create_event_for_timing();
    stop_event  = create_event_for_timing();
    get_stream().record(start_event.get());
    get_stream().record(stop_event.get());
    measure_perf = true;
}
std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
{
if(measure_perf)
return std::make_pair(start_event.get(), stop_event.get());
return std::make_pair(nullptr, nullptr);
}
// Reports the time between start_event and stop_event as given by
// hipEventElapsedTime.
//
// Returns: 0 when either event is unset; otherwise the value written by
//          hipEventElapsedTime.
// Throws:  via MIGRAPHX_THROW when hipEventElapsedTime does not return
//          hipSuccess.
float get_elapsed_ms() const
{
    const bool have_events = start_event != nullptr and stop_event != nullptr;
    if(not have_events)
        return 0;

    float elapsed = 0;
    auto status   = hipEventElapsedTime(&elapsed, start_event.get(), stop_event.get());
    if(status != hipSuccess)
        MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
    return elapsed;
}
private:
// TODO: Make this a vector to support multiple devices
std::shared_ptr<hip_device> current_device;
std::vector<shared<hip_event_ptr>> events;
bool measure_perf = false;
shared<hip_event_ptr> start_event = nullptr;
shared<hip_event_ptr> stop_event = nullptr;
};
inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
......
......@@ -37,6 +37,8 @@ namespace gpu {
struct context;
std::string hip_error(int error);
argument allocate_gpu(const shape& s, bool host = false);
argument register_on_gpu(const argument& arg);
......
......@@ -50,17 +50,22 @@ struct kernel
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
const std::vector<kernel_argument>& args) const;
const std::vector<kernel_argument>& args,
hipEvent_t start = nullptr,
hipEvent_t stop = nullptr) const;
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
std::vector<void*> args) const;
std::vector<void*> args,
hipEvent_t start = nullptr,
hipEvent_t stop = nullptr) const;
auto launch(hipStream_t stream, std::size_t global, std::size_t local) const
template <class... Ts>
auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
{
return [=](auto&&... xs) {
launch(stream, global, local, std::vector<kernel_argument>{xs...});
launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
};
}
......
......@@ -80,7 +80,9 @@ void launch_kernel(hipFunction_t fun,
std::size_t global,
std::size_t local,
void* kernargs,
std::size_t size)
std::size_t size,
hipEvent_t start,
hipEvent_t stop)
{
assert(global > 0);
assert(local > 0);
......@@ -97,34 +99,55 @@ void launch_kernel(hipFunction_t fun,
#endif
};
auto status = hipExtModuleLaunchKernel(
fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, reinterpret_cast<void**>(&config));
auto status = hipExtModuleLaunchKernel(fun,
global,
1,
1,
local,
1,
1,
0,
stream,
nullptr,
reinterpret_cast<void**>(&config),
start,
stop);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to launch kernel: " + hip_error(status));
if(stop)
{
status = hipEventSynchronize(stop);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to sync event: " + hip_error(status));
}
}
void kernel::launch(hipStream_t stream,
std::size_t global,
std::size_t local,
std::vector<void*> args) const
std::vector<void*> args,
hipEvent_t start,
hipEvent_t stop) const
{
assert(impl != nullptr);
void* kernargs = args.data();
std::size_t size = args.size() * sizeof(void*);
launch_kernel(impl->fun, stream, global, local, kernargs, size);
launch_kernel(impl->fun, stream, global, local, kernargs, size, start, stop);
}
void kernel::launch(hipStream_t stream,
std::size_t global,
std::size_t local,
const std::vector<kernel_argument>& args) const
const std::vector<kernel_argument>& args,
hipEvent_t start,
hipEvent_t stop) const
{
assert(impl != nullptr);
std::vector<char> kernargs = pack_args(args);
std::size_t size = kernargs.size();
launch_kernel(impl->fun, stream, global, local, kernargs.data(), size);
launch_kernel(impl->fun, stream, global, local, kernargs.data(), size, start, stop);
}
} // namespace gpu
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment