Unverified Commit 7f97b8ef authored by Ted Themistokleous, committed by GitHub

Merge branch 'simplify_1_mul_div_ops' into divide_by_zero_check

parents 2ba401f0 d1fed367
@@ -33,7 +33,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace driver {
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n = 100);
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
 } // namespace driver
 } // namespace gpu
...
@@ -42,22 +42,31 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
 }
 using milliseconds = std::chrono::duration<double, std::milli>;
-double time_op(context& ctx, operation op, const std::vector<shape>& inputs, int n)
+std::pair<double, double>
+time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
 {
     // TODO: Use std::ref
-    migraphx::context gctx = ctx;
+    migraphx::context ctx = ictx;
+    auto& gctx = any_cast<migraphx::gpu::context>(ctx);
     auto output = op.compute_shape(inputs);
-    op.finalize(gctx, output, inputs);
+    op.finalize(ctx, output, inputs);
     auto args = generate_arguments(inputs);
     auto run  = [&] {
-        op.compute(gctx, output, args);
-        gctx.finish();
+        op.compute(ctx, output, args);
+        ctx.finish();
     };
+    gctx.enable_perf_measurement();
     run();
-    auto r   = range(n);
-    double t = std::accumulate(
-        r.begin(), r.end(), double{0.0}, [&](auto x, auto) { return x + time<milliseconds>(run); });
-    return t / n;
+    double host_time   = 0.0;
+    double device_time = 0.0;
+    for(auto i : range(n))
+    {
+        (void)i;
+        host_time += time<milliseconds>(run);
+        device_time += gctx.get_elapsed_ms();
+    }
+    return std::make_pair(host_time / n, device_time / n);
 }
 } // namespace driver
...
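Note: the reworked time_op above now returns an averaged (host, device) pair instead of a single host time. A minimal host-only sketch of the averaging loop, assuming any callable run(); time_run is a hypothetical helper, and the device half would come from HIP event timestamps as in context::get_elapsed_ms further down:

    #include <chrono>
    #include <utility>

    // Average the host wall-clock time of run() over n iterations, after one
    // warm-up call (mirrors the loop in the new time_op; device time omitted).
    template <class F>
    std::pair<double, double> time_run(F run, int n)
    {
        using milliseconds = std::chrono::duration<double, std::milli>;
        run(); // warm-up, as in the diff
        double host_time = 0.0;
        for(int i = 0; i < n; i++)
        {
            auto start = std::chrono::steady_clock::now();
            run();
            host_time += milliseconds(std::chrono::steady_clock::now() - start).count();
        }
        return {host_time / n, 0.0};
    }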
@@ -43,8 +43,8 @@ struct run_op : action<run_op>
         auto op = make_op(name);
         if(v.contains("fields"))
             op.from_value(v.at("fields"));
-        double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
-        std::cout << op << ": " << t << "ms" << std::endl;
+        auto [host_time, device_time] = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
+        std::cout << op << ": " << host_time << "ms" << std::endl;
     }
 };
...
@@ -61,13 +61,25 @@ struct mlir_conv
 MIGRAPHX_REGISTER_OP(mlir_conv);
 namespace {
+MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
+{
+    if(ins->name() != "convolution")
+        return false;
+    value v    = ins->get_operator().to_value();
+    auto group = v.at("group").to<int>();
+    if(group != 1)
+        return false;
+    return true;
+}
 struct find_conv_pointwise
 {
     // Find a convolution followed by a pointwise operation.
     auto matcher() const
     {
         auto convolution =
-            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+            match::skip(match::name("contiguous"))(is_mlir_conv().bind("convolution"));
         return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
     }
...
@@ -26,30 +26,15 @@
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/clip.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/add.hpp>
-#include <migraphx/gpu/mul.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/device/layernorm.hpp>
-#include <migraphx/gpu/device/gelu.hpp>
-#include <migraphx/gpu/device/mul_add.hpp>
-#include <migraphx/gpu/device/add_clip.hpp>
-#include <migraphx/gpu/device/add_relu.hpp>
-#include <migraphx/gpu/device/add_sigmoid.hpp>
-#include <migraphx/gpu/device/add_tanh.hpp>
-#include <migraphx/gpu/device/mul_add_relu.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/match/layernorm.hpp>
-#include <migraphx/match/gelu_erf.hpp>
-#include <migraphx/match/gelu_tanh.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/array.hpp>
+#include <migraphx/permutation.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/clip.hpp>
 #include <cmath>
 #include <set>
...
@@ -225,95 +210,6 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
            contains({{0, 0}, {1, 1}}, op.stride) and contains({{1, 1}}, op.dilation);
 }
-struct hip_triadd : ternary_device<hip_triadd, &device::add>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd)
-struct hip_triadd_clip : quinary_device<hip_triadd_clip, &device::add_clip>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_clip)
-struct hip_add_clip : quaternary_device<hip_add_clip, &device::add_clip>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_clip)
-struct hip_triadd_relu : ternary_device<hip_triadd_relu, &device::add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_relu)
-struct hip_triadd_sigmoid : ternary_device<hip_triadd_sigmoid, &device::add_sigmoid>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_sigmoid)
-struct hip_triadd_tanh : ternary_device<hip_triadd_tanh, &device::add_tanh>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_tanh)
-struct hip_add_relu : binary_device<hip_add_relu, &device::add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_relu)
-struct hip_add_sigmoid : binary_device<hip_add_sigmoid, &device::add_sigmoid>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_sigmoid)
-struct hip_add_tanh : binary_device<hip_add_tanh, &device::add_tanh>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_tanh)
-struct hip_layernorm : unary_device<hip_layernorm, &device::layernorm>
-{
-    // Empty finalize to skip dimension reduction
-    void finalize(context&, const shape&, const std::vector<shape>&) {}
-};
-MIGRAPHX_REGISTER_OP(hip_layernorm)
-struct hip_triadd_layernorm : ternary_device<hip_triadd_layernorm, &device::triadd_layernorm>
-{
-    // Empty finalize to skip dimension reduction
-    void finalize(context&, const shape&, const std::vector<shape>&) {}
-};
-MIGRAPHX_REGISTER_OP(hip_triadd_layernorm)
-struct hip_gelu : unary_device<hip_gelu, &device::gelu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_gelu)
-struct hip_add_gelu : binary_device<hip_add_gelu, &device::add_gelu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_gelu)
-struct hip_gelu_new : unary_device<hip_gelu_new, &device::gelu_new>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_gelu_new)
-struct hip_add_gelu_new : binary_device<hip_add_gelu_new, &device::add_gelu_new>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_add_gelu_new)
-struct hip_mul_add : ternary_device<hip_mul_add, &device::mul_add>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_mul_add)
-struct hip_mul_add_relu : ternary_device<hip_mul_add_relu, &device::mul_add_relu>
-{
-};
-MIGRAPHX_REGISTER_OP(hip_mul_add_relu)
 void move_broadcasted_back(std::vector<instruction_ref>& args)
 {
     // Ensure the last argument is the broadcasted one
...
@@ -337,256 +233,6 @@ void move_standard_front(std::vector<instruction_ref>& args)
 auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }
 namespace {
-struct find_layernorm
-{
-    auto matcher() const { return match::layernorm(&gpu_name); }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-        // We don't fuse for non-standard layouts
-        if(not x_ins->get_shape().standard())
-            return;
-        auto relements = x_ins->get_shape().lens().back();
-        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
-            return;
-        m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
-    }
-};
-struct find_triadd_layernorm
-{
-    auto matcher() const
-    {
-        return match::name("gpu::layernorm")(match::arg(0)(match::name("gpu::triadd")(
-            match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins    = r.result;
-        auto triadd = ins->inputs().front();
-        m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
-    }
-};
-struct find_gelu
-{
-    auto matcher() const { return match::gelu_erf(&gpu_name); }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-        m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
-    }
-};
-struct find_add_gelu
-{
-    auto matcher() const
-    {
-        return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_add_gelu{}, args);
-    }
-};
-struct find_gelu_new
-{
-    bool fast_math = true;
-    auto matcher() const { return match::gelu_tanh(&gpu_name); }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto args  = ins->inputs();
-        if(fast_math)
-            m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
-        else
-            m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
-    }
-};
-struct find_add_gelu_new
-{
-    auto matcher() const
-    {
-        return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_add_gelu_new{}, args);
-    }
-};
-struct find_add_clip
-{
-    auto matcher() const
-    {
-        return match::name(std::unordered_set<std::string>{"gpu::clip", "gpu::clipped_relu"})(
-            match::arg(0)(match::any_of(match::name("gpu::add"),
-                                        match::name("gpu::triadd"),
-                                        match::any_of[match::inputs()](match::standard_shape()))
-                              .bind("add")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins  = r.instructions["add"];
-        auto ins      = r.result;
-        auto ins_args = ins->inputs();
-        auto add_args = add_ins->inputs();
-        move_standard_front(add_args);
-        move_broadcasted_back(add_args);
-        // Use the allocation from the clip operator
-        add_args.pop_back();
-        add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
-        if(add_ins->name() == "gpu::add")
-            m.replace_instruction(ins, hip_add_clip{}, add_args);
-        else if(add_ins->name() == "gpu::triadd")
-            m.replace_instruction(ins, hip_triadd_clip{}, add_args);
-    }
-};
-struct find_add_unary
-{
-    std::string op_name;
-    operation binary_add_op;
-    operation ternary_add_op;
-    auto matcher() const
-    {
-        return match::name(op_name)(match::arg(0)(
-            match::used_once(),
-            match::any_of(match::name("gpu::add"),
-                          match::name("gpu::triadd"),
-                          match::any_of(match::name("@literal"),
-                                        match::any_of[match::inputs()](match::standard_shape())))
-                .bind("add")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins = r.instructions["add"];
-        auto ins     = r.result;
-        auto args    = add_ins->inputs();
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        // Use the allocation from the relu operator
-        args.back() = ins->inputs().back();
-        if(add_ins->name() == "gpu::add")
-            m.replace_instruction(ins, binary_add_op, args);
-        else if(add_ins->name() == "gpu::triadd")
-            m.replace_instruction(ins, ternary_add_op, args);
-    }
-};
-struct find_triadd
-{
-    auto matcher() const
-    {
-        return match::name("gpu::add")(match::either_arg(0, 1)(
-            match::name("gpu::add")(match::used_once()).bind("add"),
-            match::any(match::any_of(match::name("@literal"),
-                                     match::any_of[match::inputs()](match::standard_shape())))
-                .bind("input")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto add_ins   = r.instructions["add"];
-        auto input_ins = r.instructions["input"];
-        auto ins       = r.result;
-        auto args      = add_ins->inputs();
-        auto is_broadcasted = [](auto arg) { return arg->get_shape().broadcasted(); };
-        if(std::count_if(args.begin(), args.end(), is_broadcasted) > 2)
-            return;
-        args.insert(args.begin(), input_ins);
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_triadd{}, args);
-    }
-};
-struct find_mul_add
-{
-    auto matcher() const
-    {
-        return match::name("gpu::add")(match::either_arg(0, 1)(
-            match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto mul_ins = r.instructions["mul"];
-        auto b_ins   = r.instructions["b"];
-        auto ins     = r.result;
-        auto args    = mul_ins->inputs();
-        assert(mul_ins != b_ins);
-        move_standard_front(args);
-        move_broadcasted_back(args);
-        args.insert(std::prev(args.end()), b_ins);
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_mul_add{}, args);
-    }
-};
-struct find_mul_add_relu
-{
-    auto matcher() const
-    {
-        return match::name("gpu::relu")(
-            match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto mul_add_ins = r.instructions["mul_add"];
-        auto ins         = r.result;
-        auto args        = mul_add_ins->inputs();
-        // Use the allocation from the relu operator
-        args.back() = ins->inputs().back();
-        m.replace_instruction(ins, hip_mul_add_relu{}, args);
-    }
-};
 struct miopen_fusion
 {
     struct fuse_op_data
...
@@ -827,13 +473,14 @@ void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
     m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
 }
-inline auto precompile_name(std::string s) // NOLINT
+template <class... Strings>
+inline auto precompile_name(Strings... names) // NOLINT
 {
     return match::make_basic_pred_matcher([=](instruction_ref ins) {
         if(ins->name() != "gpu::precompile_op")
             return false;
         auto op = from_value<operation>(ins->get_operator().to_value().at("op"));
-        return (op.name() == s);
+        return (contains({names...}, op.name()));
     });
 }
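Note: the precompile_name change above turns a single-name comparison into membership in a variadic list by expanding the pack into a braced list. A standalone sketch of the same idiom; contains here is a local stand-in for the migraphx helper, and matches_any is a hypothetical example function:

    #include <algorithm>
    #include <initializer_list>
    #include <string>

    // Stand-in for migraphx's contains(): linear membership test over a range.
    template <class Range, class T>
    bool contains(const Range& r, const T& x)
    {
        return std::find(r.begin(), r.end(), x) != r.end();
    }

    // Expanding the parameter pack into a braced list lets one matcher accept
    // several op names, e.g. matches_any(op, "gpu::prelayernorm", "gpu::preadd_layernorm").
    template <class... Strings>
    bool matches_any(const std::string& name, Strings... names)
    {
        return contains(std::initializer_list<std::string>{names...}, name);
    }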
...
@@ -902,14 +549,70 @@ struct find_conv_pointwise
     }
 };
-struct find_gemm_add
+struct find_gemm_pointwise
 {
     auto matcher() const
     {
-        return match::name("gpu::add")(
-            match::all_of[match::inputs()](match::standard_shape()),
-            match::either_arg(0, 1)(match::used_once().bind("c"),
-                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
+        return precompile_name("pointwise")(
+            match::nargs(3),
+            match::either_arg(0, 1)(
+                match::any_of(match::standard_shape(), match::is_constant()).bind("c"),
+                match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm")));
+    }
+    // TODO: Move to matcher.hpp
+    static auto match_param(const std::string& name)
+    {
+        return match::make_basic_pred_matcher([=](auto ins) {
+            if(ins->name() != "@param")
+                return false;
+            auto p = any_cast<builtin::param>(ins->get_operator());
+            return p.parameter == name;
+        });
+    }
+    template <class M>
+    static auto match_mul_const(M m, const std::string& var)
+    {
+        return match::name("mul")(match::either_arg(0, 1)(match::name("@literal").bind(var), m))
+            .bind(var + "_mul");
+    }
+    static auto match_add(const std::string& input, const std::string& output)
+    {
+        auto param     = match::name("@param");
+        auto add       = match::name("add")(match::args(param, param));
+        auto inner_mul = match::any_of(match_mul_const(match_param(input), "alpha"),
+                                       match_mul_const(match_param(output), "beta"));
+        auto mul_add   = match::name("add")(match::either_arg(0, 1)(inner_mul, param));
+        auto add_mul   = match_mul_const(add, "gamma");
+        return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul)));
+    }
+    static float get_float(instruction_ref ins) { return ins->get_literal().at<float>(); }
+    template <class Gemm>
+    static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input)
+    {
+        auto names = pm->get_parameter_names();
+        if(names.size() != 2)
+            return false;
+        std::sort(names.begin(), names.end());
+        unsigned output = input == 0 ? 1 : 0;
+        auto mr         = match::match_instruction(
+            *pm, std::prev(pm->end()), match_add(names[input], names[output]));
+        if(mr.result == pm->end())
+            return false;
+        if(contains(mr.instructions, "alpha_mul"))
+            gemm.alpha *= get_float(mr.instructions["alpha"]);
+        else if(contains(mr.instructions, "beta_mul"))
+            gemm.beta *= get_float(mr.instructions["beta"]);
+        else if(contains(mr.instructions, "gamma_mul"))
+        {
+            gemm.alpha *= get_float(mr.instructions["gamma"]);
+            gemm.beta *= get_float(mr.instructions["gamma"]);
+        }
+        return true;
     }
     void apply(module& m, const match::matcher_result& r) const
...
@@ -923,69 +626,85 @@ struct find_gemm_add
         // Already fused gemm
         if(not float_equal(gemm.beta, 0))
             return;
-        auto inputs = gemm_ins->inputs();
-        inputs.pop_back();
-        auto copy_ins = c_ins;
-        // Insert copy
-        if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
+        gemm.beta = 1;
+        if(not update_gemm(
+               gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1))
+            return;
+        // const-fold input if not standard shape since rocblas can't handle it
+        if(not c_ins->get_shape().standard())
         {
-            copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
+            auto c = make_op("contiguous");
+            auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
+            c_ins  = m.add_literal(l.get_shape(), l.data());
         }
-        inputs.push_back(copy_ins);
-        inputs.push_back(copy_ins);
-        gemm.beta = 1;
+        auto inputs = gemm_ins->inputs();
+        inputs.pop_back();
+        inputs.push_back(c_ins);
+        inputs.push_back(ins->inputs().back());
         m.replace_instruction(ins, gemm, inputs);
     }
 };
-auto pointwise_name(const std::string& s)
-{
-    return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
-        module_ref pm = ins->module_inputs().front();
-        auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
-        if(n != 1)
-            return false;
-        return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
-            return starts_with(i.name(), "@") or i.name() == s;
-        });
-    }));
-}
-struct find_gemm_pointwise
-{
-    auto matcher() const
-    {
-        return pointwise_name("add")(
-            match::nargs(3),
-            match::all_of[match::inputs()](match::standard_shape()),
-            match::either_arg(0, 1)(match::used_once().bind("c"),
-                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
-    }
-    void apply(module& m, const match::matcher_result& r) const
-    {
-        auto ins      = r.result;
-        auto gemm_ins = r.instructions["gemm"];
-        auto c_ins    = r.instructions["c"];
-        auto gemm     = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
-        // Already fused gemm
-        if(not float_equal(gemm.beta, 0))
-            return;
-        auto inputs = gemm_ins->inputs();
-        inputs.pop_back();
-        inputs.push_back(c_ins);
-        inputs.push_back(ins->inputs().back());
-        gemm.beta = 1;
-        m.replace_instruction(ins, gemm, inputs);
-    }
-};
+struct find_contiguous_tranpose_gemm
+{
+    auto matcher() const
+    {
+        return match::name("gpu::contiguous")(match::arg(0)(
+            match::name("transpose")(
+                match::arg(0)(match::name("gpu::gemm")(match::used_once()).bind("gemm")))
+                .bind("transpose")));
+    }
+    template <class Vector>
+    static bool is_swapped(const Vector& perm, std::size_t i, std::size_t j)
+    {
+        if(i >= perm.size() or j >= perm.size())
+            return false;
+        auto perm2 = perm;
+        std::iota(perm2.begin(), perm2.end(), 0);
+        std::swap(perm2[i], perm2[j]);
+        return perm2 == perm;
+    }
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins       = r.result;
+        auto gemm      = r.instructions["gemm"];
+        auto alloc     = gemm->inputs().back();
+        auto transpose = r.instructions["transpose"];
+        auto perm      = transpose->get_operator().to_value()["permutation"].to_vector<int64_t>();
+        auto iperm     = invert_permutation(perm);
+        if(perm.size() < 3)
+            return;
+        if(not is_swapped(perm, perm.size() - 3, perm.size() - 2))
+            return;
+        auto lens = gemm->get_shape().lens();
+        if(lens.size() > 3 and
+           not std::all_of(lens.begin(), lens.end() - 3, [](auto i) { return i == 1; }))
+            return;
+        auto gemmv           = gemm->get_operator().to_value();
+        gemmv["trans_batch"] = 1;
+        auto s = shape{alloc->get_shape().type(), reorder_dims(alloc->get_shape().lens(), iperm)};
+        auto new_alloc = m.insert_instruction(gemm, make_op("allocate", {{"shape", to_value(s)}}));
+        auto alloc_transpose =
+            m.insert_instruction(gemm, make_op("transpose", {{"permutation", perm}}), new_alloc);
+        auto inputs   = gemm->inputs();
+        inputs.back() = alloc_transpose;
+        auto new_gemm = m.insert_instruction(gemm, make_op("gpu::gemm", gemmv), inputs);
+        auto gemm_transpose = m.insert_instruction(gemm, transpose->get_operator(), new_gemm);
+        m.replace_instruction(ins, gemm_transpose);
+    }
+};
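Note: update_gemm in find_gemm_pointwise above folds the scalar multiplies found in the pointwise module into the GEMM's own coefficients, relying on rocBLAS computing D = alpha * (A * B) + beta * C; the diff sets beta to 1 first and then lets update_gemm scale it. A scalar model of the three recognized shapes (add, mul-then-add, add-then-mul), with hypothetical values:

    #include <cassert>

    // Scalar model of the folding: ab stands for one element of A*B, c for the
    // corresponding element of the fused input C.
    int main()
    {
        double ab = 2.0, c = 3.0;
        double alpha = 1.0, beta = 1.0; // beta set to 1 before update_gemm runs

        // pattern "add":      x + y          -> alpha, beta unchanged
        // pattern "mul_add":  (alpha'*x) + y -> alpha *= alpha' (or beta *= beta')
        // pattern "add_mul":  gamma*(x + y)  -> alpha *= gamma, beta *= gamma
        double gamma = 0.5;
        assert(gamma * (alpha * ab + beta * c) == (alpha * gamma) * ab + (beta * gamma) * c);
        return 0;
    }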
...
@@ -1041,29 +760,41 @@ struct find_contiguous_pointwise
     }
 };
+struct find_layernorm_pointwise
+{
+    auto matcher() const
+    {
+        return precompile_name("pointwise")(match::arg(0)(
+            precompile_name("gpu::prelayernorm", "gpu::preadd_layernorm").bind("layernorm")));
+    }
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins       = r.result;
+        auto layernorm = r.instructions["layernorm"];
+        auto* pm       = ins->module_inputs().front();
+        if(not layernorm->module_inputs().empty())
+            return;
+        auto inputs = layernorm->inputs();
+        inputs.pop_back();
+        inputs.insert(inputs.end(), ins->inputs().begin() + 1, ins->inputs().end());
+        m.replace_instruction(ins, layernorm->get_operator(), inputs, {pm});
+    }
+};
 void fuse_ops::apply(module& m) const
 {
-    match::find_matches(m, find_contiguous_pointwise{}, find_gelu{}, find_gelu_new{fast_math});
+    match::find_matches(m, find_contiguous_pointwise{});
     run_passes(m, {dead_code_elimination{}});
-    match::find_matches(m, find_triadd{});
-    match::find_matches(m,
-                        find_layernorm{},
-                        find_conv_pointwise{ctx},
-                        find_conv_bias_relu{ctx},
-                        find_conv_bias{ctx},
-                        find_add_gelu{},
-                        find_add_gelu_new{},
-                        find_mul_add{},
-                        find_mul_add_relu{},
-                        find_add_unary{"gpu::relu", hip_add_relu{}, hip_triadd_relu{}},
-                        find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
-                        find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
-                        find_add_clip{});
+    match::find_matches(m, find_conv_pointwise{ctx}, find_conv_bias_relu{ctx}, find_conv_bias{ctx});
     run_passes(m, {dead_code_elimination{}});
     match::find_matches(m,
-                        find_triadd_layernorm{},
-                        find_gemm_add{},
+                        find_layernorm_pointwise{},
                         find_gemm_pointwise{},
+                        find_contiguous_tranpose_gemm{},
                         find_commutative_broadcast{});
     match::find_matches(m, find_contiguous{});
 }
...
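Note: find_contiguous_tranpose_gemm only fires when the transpose merely exchanges the last two matrix dimensions, leaving the batch to be handled via the new trans_batch flag; is_swapped tests for an identity permutation with exactly one pair exchanged. A self-contained sketch of that check, mirroring the function in the pass above, with an illustrative main():

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // True when perm equals the identity permutation with positions i and j
    // exchanged, e.g. {0, 2, 1, 3} for i=1, j=2.
    bool is_swapped(const std::vector<std::int64_t>& perm, std::size_t i, std::size_t j)
    {
        if(i >= perm.size() or j >= perm.size())
            return false;
        std::vector<std::int64_t> ident(perm.size());
        std::iota(ident.begin(), ident.end(), 0);
        std::swap(ident[i], ident[j]);
        return ident == perm;
    }

    int main()
    {
        assert(is_swapped({0, 2, 1, 3}, 1, 2));     // single adjacent swap: accepted
        assert(not is_swapped({1, 0, 2, 3}, 1, 2)); // a different pair was swapped
        return 0;
    }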
@@ -24,6 +24,7 @@
 #include <rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
 #include <migraphx/reduce_dims.hpp>
+#include <migraphx/permutation.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
...
@@ -67,6 +68,19 @@ void blas_shape(const shape& s)
         MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
 }
+shape transpose_batch(const shape& s, unsigned trans_batch)
+{
+    if(trans_batch == 0)
+        return s;
+    if(s.lens().size() < 3)
+        return s;
+    auto batch = s.lens().size() - 3;
+    std::vector<int64_t> perm(s.lens().size());
+    std::iota(perm.begin(), perm.end(), 0);
+    std::swap(perm[batch], perm[batch + trans_batch]);
+    return shape::from_permutation(s.type(), s.lens(), perm);
+}
 template <class R, class... Ts, class... Us>
 R rocblas_invoke(R (*f)(Ts...), Us... xs)
 {
...
@@ -97,6 +111,12 @@ void gemm_impl(context& ctx,
               bool int8_x4_format,
               bool compute_fp32)
 {
+    const bool is_3inputs = (args.size() == 4);
+    if(not is_3inputs)
+    {
+        beta = 0;
+    }
     bool transa = is_transposed(args[0].get_shape());
     bool transb = is_transposed(args[1].get_shape());
     auto n_dim  = output_shape.lens().size();
...
@@ -105,12 +125,8 @@ void gemm_impl(context& ctx,
     rocblas_int lda = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
     rocblas_int ldb = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
     rocblas_int ldc = args[2].get_shape().strides()[dim_0];
-    bool is_3inputs = (args.size() == 4);
-    if(!is_3inputs)
-    {
-        beta = 0;
-    }
+    rocblas_int ldd = is_3inputs ? args[3].get_shape().strides()[dim_0] : ldc;
     rocblas_datatype arg_type = get_type(args[0].get_shape().type());
     auto output_type          = arg_type;
     if(output_type == rocblas_datatype_i8_r)
...
@@ -160,8 +176,13 @@ void gemm_impl(context& ctx,
     auto num_matrices = std::accumulate(
         out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
-    if(num_matrices == 1)
+    if(num_matrices == 1 or (num_matrices > 1 and get_batch_stride(args[1]) == 0))
     {
+        // If the batch dimension of B is broadcasted, then we can
+        // multiply m by the batch_size and use rocblas_gemm_ex
+        // instead of rocblas_gemm_strided_batched_ex.
+        m *= num_matrices;
         // the rocblas_gemm API handles inputs and output matrices as
         // column-major format. When doing a C = A * B, we actually do
         // C^T = (B^T) * (A^T). That is the reason we input args[1] as
...
@@ -186,7 +207,7 @@ void gemm_impl(context& ctx,
                        ldc,
                        is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                        output_type,
-                       ldc,
+                       ldd,
                        compute_type,
                        rocblas_gemm_algo_standard,
                        0,
...
@@ -197,6 +218,7 @@ void gemm_impl(context& ctx,
        auto a_stride = get_batch_stride(args[0]);
        auto b_stride = get_batch_stride(args[1]);
        auto c_stride = get_batch_stride(args[2]);
+       auto d_stride = is_3inputs ? get_batch_stride(args[3]) : c_stride;
        rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                       ctx.get_stream().get_rocblas(),
                       transb ? rocblas_operation_transpose : rocblas_operation_none,
...
@@ -220,8 +242,8 @@ void gemm_impl(context& ctx,
                       c_stride,
                       is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                       output_type,
-                      ldc,
-                      c_stride,
+                      ldd,
+                      d_stride,
                       num_matrices,
                       compute_type,
                       rocblas_gemm_algo_standard,
...
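Note: the widened condition (get_batch_stride(args[1]) == 0) exploits the identity that a batched product with a shared B equals one large product with the A matrices stacked row-wise, which is why m is multiplied by num_matrices. A plain-loop sanity check of that identity, not MIGraphX code:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // C[b] = A[b] * B for each batch b, with A stored as batch x m x k row-major.
    // Stacking the batches into a (batch*m) x k matrix and doing one multiply
    // yields the same output, which is the m *= num_matrices trick.
    int main()
    {
        const std::size_t batch = 2, m = 2, k = 3, n = 2;
        std::vector<double> a(batch * m * k), b(k * n), c(batch * m * n, 0.0);
        for(std::size_t i = 0; i < a.size(); i++)
            a[i] = 1.0 + i;
        for(std::size_t i = 0; i < b.size(); i++)
            b[i] = 0.5 * i;

        // One flat GEMM over (batch*m) rows; identical to looping over batches
        // because B's batch stride is zero.
        for(std::size_t row = 0; row < batch * m; row++)
            for(std::size_t col = 0; col < n; col++)
                for(std::size_t kk = 0; kk < k; kk++)
                    c[row * n + col] += a[row * k + kk] * b[kk * n + col];

        // Spot-check one element of the second batch computed batch-wise.
        double expect = 0.0;
        for(std::size_t kk = 0; kk < k; kk++)
            expect += a[(batch - 1) * m * k + kk] * b[kk * n]; // row 0 of A[1], col 0 of B
        assert(c[(batch - 1) * m * n] == expect);
        return 0;
    }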
@@ -23,13 +23,13 @@
 */
 #include <migraphx/gpu/hip.hpp>
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device/contiguous.hpp>
 #include <miopen/miopen.h>
+#include <memory>
+#include <mutex>
 #include <vector>
 namespace migraphx {
...
@@ -77,12 +77,38 @@ void* get_device_ptr(void* hptr)
     return result;
 }
-hip_ptr allocate_gpu(std::size_t sz, bool host = false)
+struct host_ptr_cache
+{
+    std::unordered_map<void*, std::weak_ptr<void>> cache;
+    std::mutex m;
+    std::shared_ptr<void> get(void* ptr)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        auto it = cache.find(ptr);
+        if(it != cache.end())
+            return it->second.lock();
+        return nullptr;
+    }
+    void put(const std::shared_ptr<void>& p)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        cache[p.get()] = p;
+    }
+};
+static host_ptr_cache& get_host_ptr_cache()
+{
+    static host_ptr_cache cache;
+    return cache;
+}
+std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false)
 {
     if(sz > get_available_gpu_memory())
         MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
-    void* result = nullptr;
-    auto status  = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
+    void* alloc_ptr = nullptr;
+    auto status     = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
     if(status != hipSuccess)
     {
         if(host)
...
@@ -90,16 +116,28 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
         else
             return allocate_gpu(sz, true);
     }
-    assert(result != nullptr);
-    return hip_ptr{result};
+    assert(alloc_ptr != nullptr);
+    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
+    if(host)
+    {
+        get_host_ptr_cache().put(result);
+    }
+    return result;
 }
-hip_host_ptr register_on_gpu(void* ptr, std::size_t sz)
+std::shared_ptr<void> register_on_gpu(void* ptr, std::size_t sz)
 {
+    std::shared_ptr<void> result = get_host_ptr_cache().get(ptr);
+    if(result)
+    {
+        return result;
+    }
     auto status = hipHostRegister(ptr, sz, hipHostRegisterMapped);
     if(status != hipSuccess)
         MIGRAPHX_THROW("Gpu register failed: " + hip_error(status));
-    return hip_host_ptr{ptr};
+    result = share(hip_host_ptr{ptr});
+    get_host_ptr_cache().put(result);
+    return result;
 }
 template <class T>
...
@@ -115,7 +153,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
     return result;
 }
-hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
+std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
     gpu_sync();
     auto result = allocate_gpu(sz, host);
...
@@ -137,22 +175,21 @@ hip_ptr write_to_gpu(const T& x)
 argument allocate_gpu(const shape& s, bool host)
 {
-    auto p = share(allocate_gpu(s.bytes() + 1, host));
+    auto p = allocate_gpu(s.bytes() + 1, host);
     return {s, [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
 }
 argument register_on_gpu(const argument& arg)
 {
     auto arg_shared = arg.share();
-    auto p = share(register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes()));
-    return {arg_shared.get_shape(), [p, a = std::move(arg_shared)]() mutable {
-        return get_device_ptr(p.get());
-    }}; // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
+    auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
+    return {arg_shared.get_shape(),
+            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+}
 argument to_gpu(const argument& arg, bool host)
 {
-    auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes(), host));
+    auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host);
     return {arg.get_shape(), p};
 }
...
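Note: host_ptr_cache above keeps only weak_ptrs, so a registration is reused while any caller still holds it but expires naturally once the last owner drops it; weak_ptr::lock() then returns an empty shared_ptr. A small demonstration of that lifetime behavior with generic pointers and no HIP calls (ptr_cache is a renamed sketch of host_ptr_cache):

    #include <cassert>
    #include <memory>
    #include <mutex>
    #include <unordered_map>

    struct ptr_cache
    {
        std::unordered_map<void*, std::weak_ptr<void>> cache;
        std::mutex m;
        std::shared_ptr<void> get(void* ptr)
        {
            std::lock_guard<std::mutex> lock(m);
            auto it = cache.find(ptr);
            if(it != cache.end())
                return it->second.lock();
            return nullptr;
        }
        void put(const std::shared_ptr<void>& p)
        {
            std::lock_guard<std::mutex> lock(m);
            cache[p.get()] = p; // stores a non-owning reference
        }
    };

    int main()
    {
        ptr_cache c;
        auto p = std::shared_ptr<void>(new int(42), [](void* q) { delete static_cast<int*>(q); });
        c.put(p);
        assert(c.get(p.get()) == p); // still alive: cache hands back shared ownership
        void* raw = p.get();
        p.reset();                   // last owner gone; the weak_ptr expires
        assert(c.get(raw) == nullptr);
        return 0;
    }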
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_ASINH_HPP
#define MIGRAPHX_GUARD_RTGLIB_ASINH_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/asinh.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_asinh : unary_device<hip_asinh, device::asinh>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_ATAN_HPP
#define MIGRAPHX_GUARD_RTGLIB_ATAN_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/atan.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_atan : unary_device<hip_atan, device::atan>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_ATANH_HPP
#define MIGRAPHX_GUARD_RTGLIB_ATANH_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/atanh.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_atanh : unary_device<hip_atanh, device::atanh>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_CEIL_HPP
#define MIGRAPHX_GUARD_RTGLIB_CEIL_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/ceil.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_ceil : unary_device<hip_ceil, device::ceil>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
...
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_GPU_COMPILE_GEN_HPP
 #include <migraphx/config.hpp>
+#include <migraphx/module_ref.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
...
@@ -35,6 +36,9 @@ inline namespace MIGRAPHX_INLINE_NS {
 struct shape;
 namespace gpu {
+struct context;
 namespace gen {
 struct vectorize
...
@@ -42,6 +46,10 @@ struct vectorize
     std::size_t size = 1;
     std::size_t axis = 0;
     static vectorize elements(std::size_t axis, const std::vector<shape>& inputs);
+    static vectorize elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs);
+    static vectorize elements(std::size_t axis,
+                              const std::vector<shape>& inputs,
+                              const std::vector<std::size_t>& sizes);
     std::string str() const;
 };
 struct preload
...
@@ -62,6 +70,10 @@ std::string make_transformer_args(Ts... xs)
     return make_transformer_args({xs.str()...});
 }
+std::string generate_pointwise(const module& pm, const std::string& name);
+std::string generate_name_from_ops(const module& m);
 } // namespace gen
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
...
@@ -24,8 +24,9 @@
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONCAT_GPU_OPT_HPP
-#include <migraphx/gpu/concat.hpp>
+#include <migraphx/op/concat.hpp>
 #include <migraphx/operation.hpp>
+#include <migraphx/serialize.hpp>
 namespace migraphx {
 namespace gpu {
...
@@ -36,7 +37,8 @@ struct concat_gpu_optimization
     std::string allocate() const { return "hip::allocate"; }
     migraphx::op::concat get_concat(const migraphx::operation& op) const
     {
-        return migraphx::any_cast<migraphx::gpu::hip_concat>(op).op;
+        auto v = op.to_value();
+        return from_value<migraphx::op::concat>(v.at("op"));
     }
 };
...
@@ -197,7 +197,9 @@ struct hip_device
 struct context
 {
     context(std::size_t device_id = 0, std::size_t n = value_of(MIGRAPHX_NSTREAMS{}, 1))
-        : current_device(std::make_shared<hip_device>(device_id, n))
+        : current_device(std::make_shared<hip_device>(device_id, n)),
+          begin_event(create_event()),
+          finish_event(create_event())
     {
     }
...
@@ -244,6 +246,15 @@ struct context
         return hip_event_ptr{event};
     }
+    static hip_event_ptr create_event_for_timing()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreate(&event);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event");
+        return hip_event_ptr{event};
+    }
     value to_value() const
     {
         value result;
...
@@ -265,12 +276,73 @@ struct context
         this->current_device = std::make_shared<hip_device>(0, n_streams);
     }
+    void wait_for(any_ptr queue)
+    {
+        auto status = hipEventRecord(begin_event.get(), queue.get<hipStream_t>());
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("failed to record " + hip_error(status));
+        get_stream().wait(begin_event.get());
+    }
+    void finish_on(any_ptr queue)
+    {
+        get_stream().record(finish_event.get());
+        auto status = hipStreamWaitEvent(queue.get<hipStream_t>(), finish_event.get(), 0);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to wait on event " + hip_error(status));
+    }
     any_ptr get_queue() { return get_stream().get(); }
+    void enable_perf_measurement(bool b = true)
+    {
+        if(b)
+        {
+            start_event = create_event_for_timing();
+            stop_event  = create_event_for_timing();
+            get_stream().record(start_event.get());
+            get_stream().record(stop_event.get());
+        }
+        else
+        {
+            start_event = nullptr;
+            stop_event  = nullptr;
+        }
+        measure_perf = b;
+    }
+    std::pair<hipEvent_t, hipEvent_t> get_perf_events() const
+    {
+        if(measure_perf)
+            return std::make_pair(start_event.get(), stop_event.get());
+        return std::make_pair(nullptr, nullptr);
+    }
+    float get_elapsed_ms() const
+    {
+        float result = 0;
+        if(start_event != nullptr and stop_event != nullptr)
+        {
+            auto status = hipEventElapsedTime(&result, start_event.get(), stop_event.get());
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed hipEventElapsedTime: " + hip_error(status));
+        }
+        return result;
+    }
     private:
     // TODO: Make this a vector to support multiple devices
     std::shared_ptr<hip_device> current_device;
     std::vector<shared<hip_event_ptr>> events;
+    bool measure_perf = false;
+    // for event perf timing
+    shared<hip_event_ptr> start_event = nullptr;
+    shared<hip_event_ptr> stop_event  = nullptr;
+    // for stream synchronization
+    shared<hip_event_ptr> begin_event  = nullptr;
+    shared<hip_event_ptr> finish_event = nullptr;
 };
 inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
...
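Note: get_elapsed_ms above relies on the standard HIP event-timing pattern: record a start and a stop event on the stream, wait for the stop event, then ask the runtime for the delta. A minimal standalone sketch of that pattern; elapsed_ms is a hypothetical free function, whereas the diff keeps the events inside context:

    #include <hip/hip_runtime.h>
    #include <stdexcept>

    // Time whatever work `enqueue` puts on `stream`, in milliseconds.
    template <class F>
    float elapsed_ms(hipStream_t stream, F enqueue)
    {
        auto check = [](hipError_t e) {
            if(e != hipSuccess)
                throw std::runtime_error(hipGetErrorString(e));
        };
        hipEvent_t start;
        hipEvent_t stop;
        check(hipEventCreate(&start));
        check(hipEventCreate(&stop));
        check(hipEventRecord(start, stream)); // timestamp before the work
        enqueue(stream);
        check(hipEventRecord(stop, stream));  // timestamp after the work
        check(hipEventSynchronize(stop));     // block until stop has completed
        float ms = 0;
        check(hipEventElapsedTime(&ms, start, stop));
        hipEventDestroy(start);
        hipEventDestroy(stop);
        return ms;
    }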
@@ -39,6 +39,10 @@ struct miopen_convolution
     op::convolution op;
     shared<convolution_descriptor> cd = nullptr;
     miopenConvFwdAlgorithm_t algo{};
+#ifdef MIGRAPHX_HAS_FIND_2_API
+    value::binary solution_object{};
+    shared<miopen_solution> solution_ptr = nullptr;
+#endif
     uint64_t solution_id = 0;
     template <class Self, class F>
...
@@ -49,6 +53,9 @@ struct miopen_convolution
                     f(self.op.dilation, "dilation"),
                     f(self.op.group, "group"),
                     f(self.op.padding_mode, "padding_mode"),
+#ifdef MIGRAPHX_HAS_FIND_2_API
+                    f(self.solution_object, "solution_object"),
+#endif
                     f(self.solution_id, "solution_id"));
     }
...
@@ -57,7 +64,7 @@ struct miopen_convolution
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
     shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs);
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
...
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_COSH_HPP
#define MIGRAPHX_GUARD_RTGLIB_COSH_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/cosh.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_cosh : unary_device<hip_cosh, device::cosh>
{
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ACOSH_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ACOSH_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void acosh(hipStream_t stream, const argument& result, const argument& arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);
void add(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_CLIP_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_CLIP_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& min_arg,
const argument& max_arg);
void add_clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3,
const argument& min_arg,
const argument& max_arg);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/* MIT License, Copyright (c) 2015-2022 Advanced Micro Devices, Inc. (standard header, text identical to the first file above) */
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2);
void add_relu(hipStream_t stream,
const argument& result,
const argument& arg1,
const argument& arg2,
const argument& arg3);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif