Unverified commit 6416f066, authored by Paul Fultz II, committed by GitHub

Merge branch 'develop' into fastsoftmax

parents 647d5dc5 97a1ed2d
@@ -138,16 +138,16 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
         std::size_t groups = (n + local - 1) / local;
         std::size_t max_blocks = max_global / local;
         std::size_t nglobal = std::min(max_blocks * over, groups) * local;
-        return nglobal;
+        return std::min(nglobal, n);
     };
 }
 
 std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
 {
-    size_t block_size = 128;
-    while(block_size <= max_block_size and block_size <= n)
-        block_size *= 2;
-    return block_size / 2;
+    const std::size_t min_block_size = 64;
+    const std::size_t base_block_size = 32;
+    auto block_size = (((n - 1) / base_block_size + 1)) * base_block_size;
+    return std::min(std::max(min_block_size, block_size), max_block_size);
 }
 
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
......
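Note on the compute changes above: compute_global_for now clamps the global size to n, so no more work-items are scheduled than there are elements. compute_block_size previously returned the largest power of two not exceeding n (bottoming out at 64); the new version instead rounds n up to the next multiple of 32 and clamps the result into [64, max_block_size]. A standalone copy of the new function with a few sample evaluations (the main and the sample values are illustrative only):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

// Standalone copy of the new compute_block_size, for illustration only.
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
    const std::size_t min_block_size = 64;
    const std::size_t base_block_size = 32;
    // Round n up to a multiple of 32, then clamp to [64, max_block_size].
    auto block_size = ((n - 1) / base_block_size + 1) * base_block_size;
    return std::min(std::max(min_block_size, block_size), max_block_size);
}

int main()
{
    std::cout << compute_block_size(20, 1024) << '\n';   // 64   (32, raised to the minimum)
    std::cout << compute_block_size(100, 1024) << '\n';  // 128  (100 -> next multiple of 32)
    std::cout << compute_block_size(5000, 1024) << '\n'; // 1024 (clamped to max_block_size)
}
```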
@@ -131,7 +131,7 @@ struct hip_array
     friend MIGRAPHX_DEVICE_CONSTEXPR bool operator!=(const hip_array& x, const hip_array& y)
     {
-        return !(x == y);
+        return not(x == y);
     }
     // This uses the product order rather than lexical order
     friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<(const hip_array& x, const hip_array& y)
......
@@ -117,12 +117,13 @@ template <class V, class F, class... Ts>
 void hip_visit_all_impl(const shape& s, F f, V&& v, Ts&&... xs)
 {
     std::initializer_list<migraphx::shape::type_t> types = {get_shape(xs).type()...};
-    if(!std::all_of(
+    if(not std::all_of(
            types.begin(), types.end(), [&](migraphx::shape::type_t t) { return t == s.type(); }))
         MIGRAPHX_THROW("Types must be the same");
     std::initializer_list<index_int> ranks = {
         static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(!std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    if(not std::all_of(
+           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
         MIGRAPHX_THROW("Ranks must be the same");
     visit_tensor_size(s.lens().size(), [&](auto ndim) {
         s.visit_type(hip_visitor([&](auto as) { v(f(xs, ndim, as)...); }));
@@ -134,7 +135,8 @@ void hip_visit_views_impl(const shape& s, F f, V&& v, Ts&&... xs)
 {
     std::initializer_list<index_int> ranks = {
         static_cast<index_int>(get_shape(xs).lens().size())...};
-    if(!std::all_of(ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
+    if(not std::all_of(
+           ranks.begin(), ranks.end(), [&](index_int r) { return r == s.lens().size(); }))
         MIGRAPHX_THROW("Ranks must be the same");
     visit_tensor_size(s.lens().size(), [&](auto ndim) { v(f(xs, ndim)...); });
 }
......
@@ -47,7 +47,7 @@ constexpr Iterator upper_bound(Iterator first, Iterator last, const T& value)
         it = first;
         step = count / 2;
         std::advance(it, step);
-        if(!(value < *it))
+        if(not(value < *it))
         {
             first = ++it;
             count -= step + 1;
......
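The ! to not spelling change above is behavior-preserving: not (value < *it) still advances first past every element not greater than value, which is the standard upper-bound contract (return the first element greater than value). For orientation, the equivalent standard-library call:

```cpp
#include <algorithm>

int main()
{
    const int a[] = {1, 2, 2, 3};
    // upper_bound returns the first element greater than the probe value,
    // i.e. the end of the equal range; here it points at 3.
    const int* it = std::upper_bound(a, a + 4, 2);
    return (*it == 3) ? 0 : 1;
}
```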
@@ -38,8 +38,11 @@ struct compile_op : action<compile_op>
         context ctx;
         auto inputs = p.parse_shapes(v.at("inputs"));
         auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
-        double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << t << "ms" << std::endl;
+        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << host_time << "ms";
+        if(device_time > 0)
+            std::cout << ", " << device_time << "ms";
+        std::cout << std::endl;
     }
 };
......
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace driver {
 
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n = 100);
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
 
 } // namespace driver
 } // namespace gpu
......
@@ -42,22 +42,31 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
 }
 
 using milliseconds = std::chrono::duration<double, std::milli>;
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
 {
     // TODO: Use std::ref
-    migraphx::context gctx = ctx;
-    auto output = op.compute_shape(inputs);
-    op.finalize(gctx, output, inputs);
+    migraphx::context ctx = ictx;
+    auto& gctx = any_cast<migraphx::gpu::context>(ctx);
+    auto output = op.compute_shape(inputs);
+    op.finalize(ctx, output, inputs);
     auto args = generate_arguments(inputs);
     auto run = [&] {
-        op.compute(gctx, output, args);
-        gctx.finish();
+        op.compute(ctx, output, args);
+        ctx.finish();
     };
+    gctx.enable_perf_measurement();
+    run();
-    auto r = range(n);
-    double t = std::accumulate(
-        r.begin(), r.end(), double{0.0}, [&](auto x, auto) { return x + time<milliseconds>(run); });
-    return t / n;
+    double host_time = 0.0;
+    double device_time = 0.0;
+    for(auto i : range(n))
+    {
+        (void)i;
+        host_time += time<milliseconds>(run);
+        device_time += gctx.get_elapsed_ms();
+    }
+    return std::make_pair(host_time / n, device_time / n);
 }
 
 } // namespace driver
......
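In the reworked loop, host_time accumulates wall-clock time around each run (including the ctx.finish() queue drain), while device_time reads back the start/stop hipEvents that enable_perf_measurement arms (see the context changes below). The time<milliseconds>(run) helper is presumably shaped roughly like this (an assumed sketch, not MIGraphX's actual implementation):

```cpp
#include <chrono>

// Assumed shape of the time<Duration>(f) helper used above: invoke f once and
// return the elapsed wall-clock time in Duration's representation, which for
// milliseconds = std::chrono::duration<double, std::milli> is a double.
template <class Duration, class F>
typename Duration::rep time(F f)
{
    auto start = std::chrono::steady_clock::now();
    f();
    return std::chrono::duration_cast<Duration>(std::chrono::steady_clock::now() - start).count();
}
```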
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
         auto op = make_op(name);
         if(v.contains("fields"))
             op.from_value(v.at("fields"));
-        double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << t << "ms" << std::endl;
+        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << host_time << "ms" << std::endl;
     }
 };
......
@@ -61,13 +61,25 @@ struct mlir_conv
 MIGRAPHX_REGISTER_OP(mlir_conv);
 
 namespace {
+
+MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
+{
+    if(ins->name() != "convolution")
+        return false;
+    value v = ins->get_operator().to_value();
+    auto group = v.at("group").to<int>();
+    if(group != 1)
+        return false;
+    return true;
+}
+
 struct find_conv_pointwise
 {
     // Find a convolution followed by a pointwise operation.
     auto matcher() const
     {
         auto convolution =
-            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+            match::skip(match::name("contiguous"))(is_mlir_conv().bind("convolution"));
         return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
     }
......
@@ -26,7 +26,6 @@
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/clip.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/oper.hpp>
@@ -50,8 +49,6 @@
 #include <migraphx/array.hpp>
 #include <migraphx/permutation.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/clip.hpp>
-#include <migraphx/op/contiguous.hpp>
 #include <cmath>
 #include <set>
@@ -262,7 +259,7 @@ struct hip_add_relu : binary_device<hip_add_relu, &device::add_relu>
 };
 MIGRAPHX_REGISTER_OP(hip_add_relu)
 
-struct hip_add_sigmoid : binary_device<hip_add_relu, &device::add_sigmoid>
+struct hip_add_sigmoid : binary_device<hip_add_sigmoid, &device::add_sigmoid>
 {
 };
 MIGRAPHX_REGISTER_OP(hip_add_sigmoid)
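The hip_add_sigmoid fix above is a classic CRTP slip: binary_device takes the derived class as its first template argument, so passing hip_add_relu made hip_add_sigmoid dispatch through the wrong derived type for anything routed via that parameter, such as its reflected name. A generic toy of the failure mode (hypothetical names, not the real binary_device):

```cpp
#include <string>

// Toy CRTP base: behavior is routed through the Derived parameter, so each
// operator must pass itself.
template <class Derived>
struct named_op
{
    std::string name() const { return Derived::op_name(); }
};

struct add_relu : named_op<add_relu>
{
    static std::string op_name() { return "add_relu"; }
};

// The bug pattern: the base is instantiated with the wrong derived class. This
// compiles, but name() now reports "add_relu" for the sigmoid operator.
struct add_sigmoid : named_op<add_relu> // should be named_op<add_sigmoid>
{
    static std::string op_name() { return "add_sigmoid"; }
};
```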
@@ -1036,7 +1033,7 @@ struct find_gemm_pointwise
             // const-fold input if not standard shape since rocblas can't handle it
             if(not c_ins->get_shape().standard())
             {
-                auto c = op::contiguous{};
+                auto c = make_op("contiguous");
                 auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
                 c_ins = m.add_literal(l.get_shape(), l.data());
             }
......
@@ -112,7 +112,7 @@ void gemm_impl(context& ctx,
                bool compute_fp32)
 {
     const bool is_3inputs = (args.size() == 4);
-    if(!is_3inputs)
+    if(not is_3inputs)
     {
         beta = 0;
     }
@@ -176,8 +176,13 @@ void gemm_impl(context& ctx,
     auto num_matrices = std::accumulate(
         out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
-    if(num_matrices == 1)
+    if(num_matrices == 1 or (num_matrices > 1 and get_batch_stride(args[1]) == 0))
     {
+        // If the batch dimension of B is broadcasted, then we can
+        // multiply m by the batch_size and use rocblas_gemm_ex
+        // instead of rocblas_gemm_strided_batched_ex.
+        m *= num_matrices;
+
         // the rocblas_gemm API handles inputs and output matrices as
         // column-major format. When doing a C = A * B, we actually do
         // C^T = (B^T) * (A^T). That is the reason we input args[1] as
......
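The widened condition above rests on a simple identity: when every batch multiplies by the same B (batch stride 0, i.e. B's batch dimension is broadcast), stacking the A matrices row-wise turns the batched product into one large GEMM, which is what m *= num_matrices sets up so a single rocblas_gemm_ex call can replace rocblas_gemm_strided_batched_ex. A small CPU check of the equivalence (plain loops, arbitrary sample sizes):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t batch = 3, m = 2, k = 4, n = 2;
    std::vector<float> a(batch * m * k), b(k * n), c1(batch * m * n), c2(batch * m * n);
    for(std::size_t i = 0; i < a.size(); i++)
        a[i] = float(i % 7);
    for(std::size_t i = 0; i < b.size(); i++)
        b[i] = float(i % 5);

    // Batched: C[i] = A[i] * B, with B shared across batches (batch stride 0).
    for(std::size_t bi = 0; bi < batch; bi++)
        for(std::size_t r = 0; r < m; r++)
            for(std::size_t cc = 0; cc < n; cc++)
            {
                float s = 0;
                for(std::size_t x = 0; x < k; x++)
                    s += a[bi * m * k + r * k + x] * b[x * n + cc];
                c1[bi * m * n + r * n + cc] = s;
            }

    // Single GEMM: treat A as one (batch*m) x k matrix over the same memory.
    for(std::size_t r = 0; r < batch * m; r++)
        for(std::size_t cc = 0; cc < n; cc++)
        {
            float s = 0;
            for(std::size_t x = 0; x < k; x++)
                s += a[r * k + x] * b[x * n + cc];
            c2[r * n + cc] = s;
        }

    assert(c1 == c2); // identical results, so one big GEMM call suffices
}
```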
@@ -244,6 +244,15 @@ struct context
         return hip_event_ptr{event};
     }
 
+    static hip_event_ptr create_event_for_timing()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreate(&event);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event");
+        return hip_event_ptr{event};
+    }
+
     value to_value() const
     {
         value result;
@@ -267,10 +276,49 @@ struct context
 
     any_ptr get_queue() { return get_stream().get(); }
 
+    void enable_perf_measurement(bool b = true)
+    {
+        if(b)
+        {
+            start_event = create_event_for_timing();
+            stop_event = create_event_for_timing();
+            get_stream().record(start_event.get());
+            get_stream().record(stop_event.get());
+        }
+        else
+        {
+            start_event = nullptr;
+            stop_event = nullptr;
+        }
+        measure_perf = b;
+    }
+
+    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
+    {
+        if(measure_perf)
+            return std::make_pair(start_event.get(), stop_event.get());
+        return std::make_pair(nullptr, nullptr);
+    }
+
+    float get_elapsed_ms() const
+    {
+        float result = 0;
+        if(start_event != nullptr and stop_event != nullptr)
+        {
+            auto status = hipEventElapsedTime(&result, start_event.get(), stop_event.get());
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
+        }
+        return result;
+    }
+
     private:
     // TODO: Make this a vector to support multiple devices
     std::shared_ptr<hip_device> current_device;
     std::vector<shared<hip_event_ptr>> events;
+    bool measure_perf = false;
+    shared<hip_event_ptr> start_event = nullptr;
+    shared<hip_event_ptr> stop_event = nullptr;
 };
 
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
......
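For readers new to hipEvents: enable_perf_measurement creates a start/stop pair and records both on the stream immediately (so get_elapsed_ms is well-defined even before a kernel re-records them), get_perf_events hands the pair to kernel launches (see the kernel::launch change below), and get_elapsed_ms asks HIP for the GPU-side time between the two. Note that get_elapsed_ms does not synchronize; the caller must ensure the stop event has completed, which time_op above gets from the ctx.finish() inside run. The underlying HIP pattern, as a standalone sketch with error checks elided:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void dummy_kernel() {}

int main()
{
    hipEvent_t start;
    hipEvent_t stop;
    hipEventCreate(&start); // what create_event_for_timing wraps
    hipEventCreate(&stop);

    hipEventRecord(start, nullptr); // record on the null stream
    hipLaunchKernelGGL(dummy_kernel, dim3(1), dim3(64), 0, nullptr);
    hipEventRecord(stop, nullptr);
    hipEventSynchronize(stop); // wait until the stop event has completed

    float ms = 0;
    hipEventElapsedTime(&ms, start, stop); // what get_elapsed_ms wraps
    std::printf("elapsed: %fms\n", ms);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    return 0;
}
```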
@@ -27,7 +27,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/op/gather.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/context.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -37,6 +37,8 @@ namespace gpu {
struct context;
std::string hip_error(int error);
argument allocate_gpu(const shape& s, bool host = false);
argument register_on_gpu(const argument& arg);
......
@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_RTGLIB_INT8_CONV_PACK_HPP
 
 #include <migraphx/argument.hpp>
-#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>
......
@@ -25,7 +25,6 @@
 #define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
 
 #include <migraphx/argument.hpp>
-#include <migraphx/op/quant_dot.hpp>
 #include <migraphx/config.hpp>
 #include <utility>
......
@@ -50,17 +50,22 @@ struct kernel
     void launch(hipStream_t stream,
                 std::size_t global,
                 std::size_t local,
-                const std::vector<kernel_argument>& args) const;
+                const std::vector<kernel_argument>& args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop = nullptr) const;
 
     void launch(hipStream_t stream,
                 std::size_t global,
                 std::size_t local,
-                std::vector<void*> args) const;
+                std::vector<void*> args,
+                hipEvent_t start = nullptr,
+                hipEvent_t stop = nullptr) const;
 
-    auto launch(hipStream_t stream, std::size_t global, std::size_t local) const
+    template <class... Ts>
+    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
     {
         return [=](auto&&... xs) {
-            launch(stream, global, local, std::vector<kernel_argument>{xs...});
+            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
         };
     }
......
@@ -24,22 +24,10 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
 #define MIGRAPHX_GUARD_RTGLIB_LOGSOFTMAX_HPP
 
-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
 #include <migraphx/op/logsoftmax.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
+#include <migraphx/gpu/context.hpp>
 #include <utility>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......
@@ -26,7 +26,7 @@
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/context.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
 
 #include <migraphx/config.hpp>
-#include <migraphx/gpu/context.hpp>
+#include <string>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
......